mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Patch a bug in fasta and fastq reading
Former-commit-id: bcaa264b4c4a7c67617eb909b199176bf09913db
This commit is contained in:
@ -1,9 +1,12 @@
|
||||
package obichunk
|
||||
|
||||
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
type __options__ struct {
|
||||
statsOn []string
|
||||
statsOn obiseq.StatsOnDescriptions
|
||||
categories []string
|
||||
navalue string
|
||||
cacheOnDisk bool
|
||||
@ -21,7 +24,7 @@ type WithOption func(Options)
|
||||
|
||||
func MakeOptions(setters []WithOption) Options {
|
||||
o := __options__{
|
||||
statsOn: make([]string, 0, 100),
|
||||
statsOn: make(obiseq.StatsOnDescriptions, 10),
|
||||
categories: make([]string, 0, 100),
|
||||
navalue: "NA",
|
||||
cacheOnDisk: false,
|
||||
@ -53,7 +56,7 @@ func (opt Options) PopCategories() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (opt Options) StatsOn() []string {
|
||||
func (opt Options) StatsOn() obiseq.StatsOnDescriptions {
|
||||
return opt.pointer.statsOn
|
||||
}
|
||||
|
||||
@ -114,7 +117,10 @@ func OptionNAValue(na string) WithOption {
|
||||
|
||||
func OptionStatOn(keys ...string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.statsOn = append(opt.pointer.categories, keys...)
|
||||
for _, k := range keys {
|
||||
d := obiseq.MakeStatsOnDescription(k)
|
||||
opt.pointer.statsOn[d.Name] = d
|
||||
}
|
||||
})
|
||||
|
||||
return f
|
||||
|
@ -50,13 +50,12 @@ func _ParseFastaFile(source string,
|
||||
var identifier string
|
||||
var definition string
|
||||
|
||||
state := 0
|
||||
|
||||
idBytes := new(bytes.Buffer)
|
||||
defBytes := new(bytes.Buffer)
|
||||
seqBytes := new(bytes.Buffer)
|
||||
idBytes := bytes.Buffer{}
|
||||
defBytes := bytes.Buffer{}
|
||||
seqBytes := bytes.Buffer{}
|
||||
|
||||
for chunks := range input {
|
||||
state := 0
|
||||
scanner := bufio.NewReader(chunks.raw)
|
||||
start, _ := scanner.Peek(20)
|
||||
if start[0] != '>' {
|
||||
|
@ -129,14 +129,13 @@ func _ParseFastqFile(source string,
|
||||
var identifier string
|
||||
var definition string
|
||||
|
||||
state := 0
|
||||
|
||||
idBytes := new(bytes.Buffer)
|
||||
defBytes := new(bytes.Buffer)
|
||||
qualBytes := new(bytes.Buffer)
|
||||
seqBytes := new(bytes.Buffer)
|
||||
idBytes := bytes.Buffer{}
|
||||
defBytes := bytes.Buffer{}
|
||||
qualBytes := bytes.Buffer{}
|
||||
seqBytes := bytes.Buffer{}
|
||||
|
||||
for chunks := range input {
|
||||
state := 0
|
||||
scanner := bufio.NewReader(chunks.raw)
|
||||
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
||||
previous := byte(0)
|
||||
@ -257,7 +256,7 @@ func _ParseFastqFile(source string,
|
||||
}
|
||||
case 10:
|
||||
if is_end_of_line {
|
||||
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift)
|
||||
_storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift)
|
||||
|
||||
if no_order {
|
||||
if len(sequences) == batch_size {
|
||||
@ -286,7 +285,7 @@ func _ParseFastqFile(source string,
|
||||
|
||||
if len(sequences) > 0 {
|
||||
if state == 10 {
|
||||
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift)
|
||||
_storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift)
|
||||
state = 1
|
||||
}
|
||||
|
||||
|
@ -2,7 +2,7 @@ package obiiter
|
||||
|
||||
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
|
||||
func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, sizes ...int) IBioSequence {
|
||||
func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn obiseq.StatsOnDescriptions, sizes ...int) IBioSequence {
|
||||
batchsize := 100
|
||||
|
||||
if len(sizes) > 0 {
|
||||
@ -36,7 +36,7 @@ func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, si
|
||||
return newIter
|
||||
}
|
||||
|
||||
func MergePipe(na string, statsOn []string, sizes ...int) Pipeable {
|
||||
func MergePipe(na string, statsOn obiseq.StatsOnDescriptions, sizes ...int) Pipeable {
|
||||
f := func(iterator IBioSequence) IBioSequence {
|
||||
return iterator.IMergeSequenceBatch(na, statsOn, sizes...)
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ import (
|
||||
// TODO: The version number is extracted from git. This implies that the version
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// committed
|
||||
var _Commit = "58bcc67"
|
||||
var _Commit = "612868a"
|
||||
var _Version = "Release 4.2.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
@ -11,6 +11,43 @@ import (
|
||||
)
|
||||
|
||||
// StatsOnValues holds the statistics collected for one attribute: it maps an
// observed attribute value (or the NA placeholder) to an accumulated integer
// weight.
type StatsOnValues map[string]int
|
||||
// StatsOnWeights is a function computing the integer weight that a sequence
// contributes to a StatsOnValues entry.
type StatsOnWeights func(sequence *BioSequence) int
|
||||
// StatsOnDescription describes one statistic to collect on sequences: which
// attribute is summarized and how much each sequence weighs in the summary.
type StatsOnDescription struct {
|
||||
// Name is the complete descriptor string the description was built from;
// it is the value passed to StatsOnSlotName to locate the statistics slot.
Name string
|
||||
// Key is the name of the annotation whose values are counted.
Key string
|
||||
// Weight returns the amount added to the statistics for a given sequence.
Weight StatsOnWeights
|
||||
}
|
||||
// StatsOnDescriptions is a set of StatsOnDescription values indexed by string.
// NOTE(review): the callers visible in this change index it by the
// description's Name field — confirm that this holds everywhere.
type StatsOnDescriptions map[string]StatsOnDescription
|
||||
|
||||
// BioseqCount returns the value of sequence.Count(). Its signature matches
// StatsOnWeights, so it can be used wherever such a weight function is
// expected.
func BioseqCount(sequence *BioSequence) int {
|
||||
return sequence.Count()
|
||||
}
|
||||
|
||||
// MakeStatsOnDescription builds a StatsOnDescription from a textual
// descriptor of the form "key" or "key:attribute".
//
// The descriptor is split at its first ':' only. The part before the colon
// becomes the Key of the description, while Name keeps the whole descriptor.
//
// Weighting:
//   - "key": each sequence weighs s.Count().
//   - "key:attribute": each sequence weighs the integer returned by
//     s.GetIntAttribute("attribute"), or 0 when that call reports !ok.
func MakeStatsOnDescription(descriptor string) StatsOnDescription {
|
||||
// Split into at most two parts so that any further ':' stays in the
// attribute name.
parts := strings.SplitN(descriptor, ":", 2)
|
||||
var ff StatsOnWeights
|
||||
switch len(parts) {
|
||||
case 1:
|
||||
// No explicit weight attribute: weigh by the sequence count.
ff = func(s *BioSequence) int {
|
||||
return s.Count()
|
||||
}
|
||||
|
||||
case 2:
|
||||
// Weight taken from the integer attribute named after the colon.
ff = func(s *BioSequence) int {
|
||||
v, ok := s.GetIntAttribute(parts[1])
|
||||
if !ok {
|
||||
// The attribute could not be obtained as an int: contribute nothing.
return 0
|
||||
}
|
||||
return v
|
||||
}
|
||||
}
|
||||
|
||||
return StatsOnDescription{
|
||||
Name: descriptor,
|
||||
Key: parts[0],
|
||||
Weight: ff,
|
||||
}
|
||||
}
|
||||
|
||||
// StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute.
|
||||
//
|
||||
@ -50,8 +87,8 @@ func (sequence *BioSequence) HasStatsOn(key string) bool {
|
||||
//
|
||||
// Return type:
|
||||
// - StatsOnValues
|
||||
func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
|
||||
mkey := StatsOnSlotName(key)
|
||||
func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOnValues {
|
||||
mkey := StatsOnSlotName(desc.Name)
|
||||
annotations := sequence.Annotations()
|
||||
istat, ok := annotations[mkey]
|
||||
|
||||
@ -88,8 +125,8 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
|
||||
newstat = true
|
||||
}
|
||||
|
||||
if newstat && sequence.StatsPlusOne(key, sequence, na) {
|
||||
delete(sequence.Annotations(), key)
|
||||
if newstat && sequence.StatsPlusOne(desc, sequence, na) {
|
||||
delete(sequence.Annotations(), desc.Key)
|
||||
}
|
||||
|
||||
return stats
|
||||
@ -103,14 +140,14 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
|
||||
// - na: the value to be used if the attribute is not present
|
||||
// Return type:
|
||||
// - bool
|
||||
func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool {
|
||||
func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSequence, na string) bool {
|
||||
sval := na
|
||||
annotations := sequence.Annotations()
|
||||
stats := sequence.StatsOn(key, na)
|
||||
stats := sequence.StatsOn(desc, na)
|
||||
retval := false
|
||||
|
||||
if toAdd.HasAnnotation() {
|
||||
value, ok := toAdd.Annotations()[key]
|
||||
value, ok := toAdd.Annotations()[desc.Key]
|
||||
|
||||
if ok {
|
||||
|
||||
@ -139,8 +176,8 @@ func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na str
|
||||
if !ok {
|
||||
old = 0
|
||||
}
|
||||
stats[sval] = old + toAdd.Count()
|
||||
annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary
|
||||
stats[sval] = old + desc.Weight(toAdd)
|
||||
annotations[StatsOnSlotName(desc.Name)] = stats // TODO: check if this is necessary
|
||||
return retval
|
||||
}
|
||||
|
||||
@ -170,7 +207,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
|
||||
//
|
||||
// Return type:
|
||||
// - *BioSequence: the merged sequence (BioSequence)
|
||||
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence {
|
||||
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn StatsOnDescriptions) *BioSequence {
|
||||
if !inplace {
|
||||
sequence = sequence.Copy()
|
||||
}
|
||||
@ -183,14 +220,14 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
|
||||
|
||||
count := sequence.Count() + tomerge.Count()
|
||||
|
||||
for _, key := range statsOn {
|
||||
for key, desc := range statsOn {
|
||||
if tomerge.HasStatsOn(key) {
|
||||
smk := sequence.StatsOn(key, na)
|
||||
mmk := tomerge.StatsOn(key, na)
|
||||
smk := sequence.StatsOn(desc, na)
|
||||
mmk := tomerge.StatsOn(desc, na)
|
||||
|
||||
annotations[StatsOnSlotName(key)] = smk.Merge(mmk)
|
||||
} else {
|
||||
sequence.StatsPlusOne(key, tomerge, na)
|
||||
sequence.StatsPlusOne(desc, tomerge, na)
|
||||
}
|
||||
}
|
||||
|
||||
@ -237,19 +274,19 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
|
||||
//
|
||||
// Return type:
|
||||
// - *BioSequence: the merged sequence (BioSequence)
|
||||
func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence {
|
||||
func (sequences BioSequenceSlice) Merge(na string, statsOn StatsOnDescriptions) *BioSequence {
|
||||
seq := sequences[0]
|
||||
//sequences[0] = nil
|
||||
seq.SetQualities(nil)
|
||||
|
||||
if len(sequences) == 1 {
|
||||
seq.Annotations()["count"] = seq.Count()
|
||||
for _, v := range statsOn {
|
||||
seq.StatsOn(v, na)
|
||||
for _, desc := range statsOn {
|
||||
seq.StatsOn(desc, na)
|
||||
}
|
||||
} else {
|
||||
for k, toMerge := range sequences[1:] {
|
||||
seq.Merge(toMerge, na, true, statsOn...)
|
||||
seq.Merge(toMerge, na, true, statsOn)
|
||||
toMerge.Recycle()
|
||||
sequences[1+k] = nil
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) {
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) TaxonomicDistribution(sequence *obiseq.BioSequence) map[*TaxNode]int {
|
||||
taxids := sequence.StatsOn("taxid", "na")
|
||||
taxids := sequence.StatsOn(obiseq.MakeStatsOnDescription("taxid"), "na")
|
||||
taxons := make(map[*TaxNode]int, len(taxids))
|
||||
|
||||
for k, v := range taxids {
|
||||
|
@ -34,7 +34,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
||||
samples := make(map[string]*([]*seqPCR))
|
||||
|
||||
for _, s := range dataset {
|
||||
stats := s.StatsOn(tag, NAValue)
|
||||
stats := s.StatsOn(obiseq.MakeStatsOnDescription(tag), NAValue)
|
||||
|
||||
for k, v := range stats {
|
||||
pcr, ok := samples[k]
|
||||
|
@ -129,7 +129,7 @@ func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func
|
||||
|
||||
f := func(i int) float64 {
|
||||
|
||||
stats := (*seqs)[i].StatsOn(sample_key, "NA")
|
||||
stats := (*seqs)[i].StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")
|
||||
|
||||
if value, ok := stats[sample]; ok {
|
||||
return float64(value)
|
||||
@ -155,7 +155,7 @@ func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*o
|
||||
|
||||
for _, s := range seqs {
|
||||
if s.HasStatsOn(sample_key) {
|
||||
stats := s.StatsOn(sample_key, "NA")
|
||||
stats := s.StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")
|
||||
for k := range stats {
|
||||
if seqset, ok := samples[k]; ok {
|
||||
*seqset = append(*seqset, s)
|
||||
@ -378,7 +378,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
}()
|
||||
|
||||
obiuniq.AddStatsOn(CLISampleAttribute())
|
||||
obiuniq.AddStatsOn("obiconsensus_weight")
|
||||
obiuniq.AddStatsOn("sample:obiconsensus_weight")
|
||||
obiuniq.SetUniqueInMemory(false)
|
||||
obiuniq.SetNoSingleton(CLINoSingleton())
|
||||
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
|
||||
|
Reference in New Issue
Block a user