From 576a9f4d2dc8aa90761f8970c8c578841b36a50b Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 16 Jan 2022 00:21:42 +0100 Subject: [PATCH] A global version of a Slice pool --- cmd/obitools/obipairing/main.go | 24 ++--- pkg/obialign/alignment.go | 54 +++++------ pkg/obialign/pairedendalign.go | 11 ++- pkg/obiformats/embl_read.go | 2 +- pkg/obiformats/fastseq_read.go | 18 ++-- pkg/obiseq/biosequence.go | 150 +++++++++++++++++------------ pkg/obiseq/biosequenceslice.go | 4 + pkg/obiseq/pool.go | 84 ++++++++++++---- pkg/obiseq/revcomp.go | 4 +- pkg/obiseq/subseq.go | 21 ++-- pkg/obitools/obipairing/pairing.go | 4 +- 11 files changed, 227 insertions(+), 149 deletions(-) create mode 100644 pkg/obiseq/biosequenceslice.go diff --git a/cmd/obitools/obipairing/main.go b/cmd/obitools/obipairing/main.go index b8d1f37..e11bcd9 100644 --- a/cmd/obitools/obipairing/main.go +++ b/cmd/obitools/obipairing/main.go @@ -3,7 +3,7 @@ package main import ( "log" "os" - "runtime/trace" + "runtime/pprof" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" @@ -13,20 +13,20 @@ import ( func main() { // go tool pprof -http=":8000" ./obipairing ./cpu.pprof - // f, err := os.Create("cpu.pprof") - // if err != nil { - // log.Fatal(err) - // } - // pprof.StartCPUProfile(f) - // defer pprof.StopCPUProfile() - - // go tool trace cpu.trace - ftrace, err := os.Create("cpu.trace") + f, err := os.Create("cpu.pprof") if err != nil { log.Fatal(err) } - trace.Start(ftrace) - defer trace.Stop() + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() optionParser := obioptions.GenerateOptionParser(obipairing.OptionSet) diff --git a/pkg/obialign/alignment.go b/pkg/obialign/alignment.go index b40ee39..6bf507b 100644 --- a/pkg/obialign/alignment.go +++ b/pkg/obialign/alignment.go @@ -68,22 +68,22 @@ func _BuildAlignment(seqA, seqB []byte, path []int, gap byte, bufferA, bufferB * func BuildAlignment(seqA, seqB obiseq.BioSequence, path []int, gap byte) (obiseq.BioSequence, obiseq.BioSequence) { - bufferSA := _BuildAlignArenaPool.Get().(*[]byte) - defer _BuildAlignArenaPool.Put(bufferSA) + bufferSA := obiseq.GetSlice() + defer obiseq.RecycleSlice(bufferSA) - bufferSB := _BuildAlignArenaPool.Get().(*[]byte) - defer _BuildAlignArenaPool.Put(bufferSB) + bufferSB := obiseq.GetSlice() + defer obiseq.RecycleSlice(bufferSB) _BuildAlignment(seqA.Sequence(), seqB.Sequence(), path, gap, - bufferSA, - bufferSB) + &bufferSA, + &bufferSB) seqA = obiseq.MakeBioSequence(seqA.Id(), - *bufferSA, + bufferSA, seqA.Definition()) seqB = obiseq.MakeBioSequence(seqB.Id(), - *bufferSB, + bufferSB, seqB.Definition()) return seqA, seqB @@ -112,27 +112,23 @@ func BuildAlignment(seqA, seqB obiseq.BioSequence, // return. func BuildQualityConsensus(seqA, seqB obiseq.BioSequence, path []int) (obiseq.BioSequence, int) { - bufferSA := _BuildAlignArenaPool.Get().(*[]byte) - defer _BuildAlignArenaPool.Put(bufferSA) + bufferSA := obiseq.GetSlice() + bufferSB := obiseq.GetSlice() + defer obiseq.RecycleSlice(bufferSB) - bufferSB := _BuildAlignArenaPool.Get().(*[]byte) - defer _BuildAlignArenaPool.Put(bufferSB) - - bufferQA := _BuildAlignArenaPool.Get().(*[]byte) - defer _BuildAlignArenaPool.Put(bufferQA) - - bufferQB := _BuildAlignArenaPool.Get().(*[]byte) - defer _BuildAlignArenaPool.Put(bufferQB) + bufferQA := obiseq.GetSlice() + bufferQB := obiseq.GetSlice() + defer obiseq.RecycleSlice(bufferQB) _BuildAlignment(seqA.Sequence(), seqB.Sequence(), path, ' ', - bufferSA, bufferSB) + &bufferSA, &bufferSB) // log.Printf("#1 %s--> la : %d,%p lb : %d,%p qa : %d,%p qb : %d,%p\n", stamp, // len(*bufferSA), bufferSA, len(*bufferSB), bufferSB, // len(*bufferQA), bufferQA, len(*bufferQB), bufferQB) _BuildAlignment(seqA.Qualities(), seqB.Qualities(), path, byte(0), - bufferQA, bufferQB) + &bufferQA, &bufferQB) // log.Printf("#2 %s--> la : %d,%p lb : %d,%p qa : %d,%p qb : %d,%p\n", stamp, // len(*bufferSA), bufferSA, len(*bufferSB), bufferSB, @@ -145,23 +141,23 @@ func BuildQualityConsensus(seqA, seqB obiseq.BioSequence, path []int) (obiseq.Bi match := 0 - for i, qA = range *bufferQA { - nA := (*bufferSA)[i] - nB := (*bufferSB)[i] - qB = (*bufferQB)[i] + for i, qA = range bufferQA { + nA := bufferSA[i] + nB := bufferSB[i] + qB = bufferQB[i] if qA > qB { qM = qA qm = qB } if qB > qA { - (*bufferSA)[i] = (*bufferSB)[i] + bufferSA[i] = bufferSB[i] qM = qB qm = qA } if qB == qA && nA != nB { nuc := _FourBitsBaseCode[nA&31] | _FourBitsBaseCode[nB&31] - (*bufferSA)[i] = _FourBitsBaseDecode[nuc] + bufferSA[i] = _FourBitsBaseDecode[nuc] } q := qA + qB @@ -179,15 +175,15 @@ func BuildQualityConsensus(seqA, seqB obiseq.BioSequence, path []int) (obiseq.Bi q = 90 } - (*bufferQA)[i] = q + bufferQA[i] = q } consSeq := obiseq.MakeBioSequence( seqA.Id(), - (*bufferSA), + bufferSA, seqA.Definition(), ) - consSeq.SetSequence((*bufferQA)) + consSeq.SetQualities(bufferQA) return consSeq, match } diff --git a/pkg/obialign/pairedendalign.go b/pkg/obialign/pairedendalign.go index 1377903..33696e0 100644 --- a/pkg/obialign/pairedendalign.go +++ b/pkg/obialign/pairedendalign.go @@ -281,11 +281,6 @@ func PEAlign(seqA, seqB obiseq.BioSequence, _InitDNAScoreMatrix() } - // log.Println("==============") - // log.Println(seqA.String()) - // log.Println(seqB.String()) - // log.Println("--------------") - index := obikmer.Index4mer(seqA, &arena.pointer.fastIndex, &arena.pointer.fastBuffer) @@ -303,6 +298,9 @@ func PEAlign(seqA, seqB obiseq.BioSequence, // log.Printf("Shift : %d Score : %d Over : %d La : %d:%d Lb: %d:%d\n", shift, fastScore, over, seqA.Length(), len(seqA.Qualities()), seqB.Length(), len(seqB.Qualities())) if fastScore+3 < over { + + // At least one mismatch exists in the overlaping region + if shift > 0 { startA = shift - delta if startA < 0 { @@ -321,6 +319,9 @@ func PEAlign(seqA, seqB obiseq.BioSequence, &arena.pointer.scoreMatrix, &arena.pointer.pathMatrix) } else { + + // Both overlaping regions are identicals + startA = 0 startB = -shift - delta if startB < 0 { diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index bf3eaac..16efa66 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -128,7 +128,7 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiseq.IBioSequenceBatch) { seqBytes.Bytes(), defBytes.String()) - sequence.SetFeatures(featBytes.String()) + sequence.SetFeatures(featBytes.Bytes()) annot := sequence.Annotations() annot["scientific_name"] = scientificName diff --git a/pkg/obiformats/fastseq_read.go b/pkg/obiformats/fastseq_read.go index 605b88e..5cada79 100644 --- a/pkg/obiformats/fastseq_read.go +++ b/pkg/obiformats/fastseq_read.go @@ -17,7 +17,7 @@ import ( "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) -func __fastseq_reader__(seqfile C.fast_kseq_p, +func _FastseqReader(seqfile C.fast_kseq_p, iterator obiseq.IBioSequenceBatch, batch_size int) { var comment string @@ -30,8 +30,12 @@ func __fastseq_reader__(seqfile C.fast_kseq_p, s := seqfile.seq - sequence := C.GoBytes(unsafe.Pointer(s.seq.s), - C.int(s.seq.l)) + csequence := cutils.ByteSlice(unsafe.Pointer(s.seq.s), int(s.seq.l)) + sequence := obiseq.GetSlice() + sequence = append(sequence, csequence...) + + //sequence := C.GoBytes(unsafe.Pointer(s.seq.s), + // C.int(s.seq.l)) name := C.GoString(s.name.s) @@ -45,11 +49,11 @@ func __fastseq_reader__(seqfile C.fast_kseq_p, if s.qual.l > C.ulong(0) { cquality := cutils.ByteSlice(unsafe.Pointer(s.qual.s), int(s.qual.l)) - quality := make(obiseq.Quality, s.qual.l) l := int(s.qual.l) + quality := obiseq.GetSlice() shift := uint8(seqfile.shift) for j := 0; j < l; j++ { - quality[j] = uint8(cquality[j]) - shift + quality = append(quality, uint8(cquality[j])-shift) } rep.SetQualities(quality) @@ -116,7 +120,7 @@ func ReadFastSeqBatchFromFile(filename string, options ...WithOption) (obiseq.IB log.Println("Start of the fastq file reading") - go __fastseq_reader__(pointer, newIter, opt.BatchSize()) + go _FastseqReader(pointer, newIter, opt.BatchSize()) parser := opt.ParseFastSeqHeader() if parser != nil { return IParseFastSeqHeaderBatch(newIter, options...), err @@ -141,7 +145,7 @@ func ReadFastSeqBatchFromStdin(options ...WithOption) obiseq.IBioSequenceBatch { close(newIter.Channel()) }() - go __fastseq_reader__(C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())), newIter, opt.BatchSize()) + go _FastseqReader(C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())), newIter, opt.BatchSize()) return newIter } diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 3a37f7c..f3bf1a0 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -1,7 +1,6 @@ package obiseq import ( - "bytes" "crypto/md5" "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" @@ -23,20 +22,53 @@ func __make_default_qualities__(length int) Quality { type Annotation map[string]interface{} -type __sequence__ struct { - id bytes.Buffer - definition bytes.Buffer - sequence bytes.Buffer - qualities bytes.Buffer - feature bytes.Buffer +type _BioSequence struct { + id string + definition string + sequence []byte + qualities []byte + feature []byte annotations Annotation } type BioSequence struct { - sequence *__sequence__ + sequence *_BioSequence } -type BioSequenceSlice []BioSequence +func MakeEmptyBioSequence() BioSequence { + bs := _BioSequence{ + id: "", + definition: "", + sequence: nil, + qualities: nil, + feature: nil, + annotations: nil, + } + return BioSequence{&bs} +} + +func MakeBioSequence(id string, + sequence []byte, + definition string) BioSequence { + bs := MakeEmptyBioSequence() + bs.SetId(id) + bs.SetSequence(sequence) + bs.SetDefinition(definition) + return bs +} + +func (sequence *BioSequence) Recycle() { + + pseq := sequence.sequence + + RecycleSlice(pseq.sequence) + RecycleSlice(pseq.feature) + RecycleSlice(pseq.feature) + + RecycleAnnotation(pseq.annotations) + + sequence.sequence = nil +} var NilBioSequence = BioSequence{sequence: nil} @@ -44,75 +76,66 @@ func (s BioSequence) IsNil() bool { return s.sequence == nil } -func (s *BioSequence) Reset() { - s.sequence.id.Reset() - s.sequence.definition.Reset() - s.sequence.sequence.Reset() - s.sequence.qualities.Reset() - s.sequence.feature.Reset() - - for k := range s.sequence.annotations { - delete(s.sequence.annotations, k) - } - -} - func (s BioSequence) Copy() BioSequence { - new_seq := MakeEmptyBioSequence() - new_seq.sequence.id.Write(s.sequence.id.Bytes()) - new_seq.sequence.definition.Write(s.sequence.definition.Bytes()) - new_seq.sequence.sequence.Write(s.sequence.sequence.Bytes()) - new_seq.sequence.qualities.Write(s.sequence.qualities.Bytes()) - new_seq.sequence.feature.Write(s.sequence.feature.Bytes()) + newSeq := MakeEmptyBioSequence() + + newSeq.sequence.id = s.sequence.id + newSeq.sequence.definition = s.sequence.definition + + newSeq.sequence.sequence = GetSlice(s.sequence.sequence...) + newSeq.sequence.qualities = GetSlice(s.sequence.qualities...) + newSeq.sequence.feature = GetSlice(s.sequence.feature...) if len(s.sequence.annotations) > 0 { - goutils.CopyMap(new_seq.sequence.annotations, - s.sequence.annotations) + newSeq.sequence.annotations = GetAnnotation(s.sequence.annotations) } - return new_seq + return newSeq } func (s BioSequence) Id() string { - return s.sequence.id.String() + return s.sequence.id } func (s BioSequence) Definition() string { - return s.sequence.definition.String() + return s.sequence.definition } func (s BioSequence) Sequence() []byte { - return s.sequence.sequence.Bytes() + return s.sequence.sequence } func (s BioSequence) String() string { - return s.sequence.sequence.String() + return string(s.sequence.sequence) } func (s BioSequence) Length() int { - return s.sequence.sequence.Len() + return len(s.sequence.sequence) } func (s BioSequence) HasQualities() bool { - return s.sequence.qualities.Len() > 0 + return len(s.sequence.qualities) > 0 } func (s BioSequence) Qualities() Quality { if s.HasQualities() { - return s.sequence.qualities.Bytes() + return s.sequence.qualities } else { - return __make_default_qualities__(s.sequence.sequence.Len()) + return __make_default_qualities__(len(s.sequence.sequence)) } } func (s BioSequence) Features() string { - return s.sequence.feature.String() + return string(s.sequence.feature) } func (s BioSequence) Annotations() Annotation { + if s.sequence.annotations == nil { + s.sequence.annotations = GetAnnotation() + } return s.sequence.annotations } func (s BioSequence) MD5() [16]byte { - return md5.Sum(s.sequence.sequence.Bytes()) + return md5.Sum(s.sequence.sequence) } func (s BioSequence) Count() int { @@ -144,50 +167,55 @@ func (s BioSequence) Taxid() int { } func (s BioSequence) SetId(id string) { - s.sequence.id.Reset() - s.sequence.id.WriteString(id) + s.sequence.id = id } func (s BioSequence) SetDefinition(definition string) { - s.sequence.definition.Reset() - s.sequence.definition.WriteString(definition) + s.sequence.definition = definition } -func (s BioSequence) SetFeatures(feature string) { - s.sequence.feature.Reset() - s.sequence.feature.WriteString(feature) +func (s BioSequence) SetFeatures(feature []byte) { + if cap(s.sequence.feature) >= 300 { + RecycleSlice(s.sequence.feature) + } + s.sequence.feature = feature } func (s BioSequence) SetSequence(sequence []byte) { - s.sequence.sequence.Reset() - s.sequence.sequence.Write(sequence) + if s.sequence.sequence != nil { + RecycleSlice(s.sequence.sequence) + } + s.sequence.sequence = sequence } func (s BioSequence) SetQualities(qualities Quality) { - s.sequence.qualities.Reset() - s.sequence.qualities.Write(qualities) + if s.sequence.qualities != nil { + RecycleSlice(s.sequence.qualities) + } + s.sequence.qualities = qualities } func (s BioSequence) WriteQualities(data []byte) (int, error) { - return s.sequence.qualities.Write(data) + s.sequence.qualities = append(s.sequence.qualities, data...) + return len(data), nil } func (s BioSequence) WriteByteQualities(data byte) error { - return s.sequence.qualities.WriteByte(data) + s.sequence.qualities = append(s.sequence.qualities, data) + return nil } func (s BioSequence) Write(data []byte) (int, error) { - return s.sequence.sequence.Write(data) + s.sequence.sequence = append(s.sequence.sequence, data...) + return len(data), nil } func (s BioSequence) WriteString(data string) (int, error) { - return s.sequence.sequence.WriteString(data) + bdata := []byte(data) + return s.Write(bdata) } func (s BioSequence) WriteByte(data byte) error { - return s.sequence.sequence.WriteByte(data) -} - -func (s BioSequence) WriteRune(data rune) (int, error) { - return s.sequence.sequence.WriteRune(data) + s.sequence.sequence = append(s.sequence.sequence, data) + return nil } diff --git a/pkg/obiseq/biosequenceslice.go b/pkg/obiseq/biosequenceslice.go new file mode 100644 index 0000000..9319095 --- /dev/null +++ b/pkg/obiseq/biosequenceslice.go @@ -0,0 +1,4 @@ +package obiseq + +type BioSequenceSlice []BioSequence + diff --git a/pkg/obiseq/pool.go b/pkg/obiseq/pool.go index a183d50..3bef790 100644 --- a/pkg/obiseq/pool.go +++ b/pkg/obiseq/pool.go @@ -2,33 +2,81 @@ package obiseq import ( "sync" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" ) -var __bioseq__pool__ = sync.Pool{ +var _BioSequenceByteSlicePool = sync.Pool{ New: func() interface{} { - var bs __sequence__ - bs.annotations = make(Annotation, 50) + bs := make([]byte, 0, 300) return &bs }, } -func MakeEmptyBioSequence() BioSequence { - bs := BioSequence{__bioseq__pool__.Get().(*__sequence__)} - return bs +func RecycleSlice(s []byte) { + s0 := s[:0] + _BioSequenceByteSlicePool.Put(&s0) } -func MakeBioSequence(id string, - sequence []byte, - definition string) BioSequence { - bs := MakeEmptyBioSequence() - bs.SetId(id) - bs.Write(sequence) - bs.SetDefinition(definition) - return bs +func GetSlice(values ...byte) []byte { + s := *(_BioSequenceByteSlicePool.Get().(*[]byte)) + + if len(values) > 0 { + s = append(s, values...) + } + + return s } -func (sequence *BioSequence) Recycle() { - sequence.Reset() - __bioseq__pool__.Put(sequence.sequence) - sequence.sequence = nil +var BioSequenceAnnotationPool = sync.Pool{ + New: func() interface{} { + bs := make(Annotation, 100) + return &bs + }, } + +func RecycleAnnotation(a Annotation) { + for k := range a { + delete(a, k) + } + BioSequenceAnnotationPool.Put(&(a)) +} + +func GetAnnotation(values ...Annotation) Annotation { + a := *(BioSequenceAnnotationPool.Get().(*Annotation)) + + if len(values) > 0 { + goutils.CopyMap(a, values[0]) + } + + return a +} + +// var __bioseq__pool__ = sync.Pool{ +// New: func() interface{} { +// var bs _BioSequence +// bs.annotations = make(Annotation, 50) +// return &bs +// }, +// } + +// func MakeEmptyBioSequence() BioSequence { +// bs := BioSequence{__bioseq__pool__.Get().(*_BioSequence)} +// return bs +// } + +// func MakeBioSequence(id string, +// sequence []byte, +// definition string) BioSequence { +// bs := MakeEmptyBioSequence() +// bs.SetId(id) +// bs.Write(sequence) +// bs.SetDefinition(definition) +// return bs +// } + +// func (sequence *BioSequence) Recycle() { +// sequence.Reset() +// __bioseq__pool__.Put(sequence.sequence) +// sequence.sequence = nil +// } diff --git a/pkg/obiseq/revcomp.go b/pkg/obiseq/revcomp.go index be586d9..5357ec7 100644 --- a/pkg/obiseq/revcomp.go +++ b/pkg/obiseq/revcomp.go @@ -11,7 +11,7 @@ func (sequence BioSequence) ReverseComplement(inplace bool) BioSequence { sequence = sequence.Copy() } - s := sequence.sequence.sequence.Bytes() + s := sequence.sequence.sequence for i, j := sequence.Length()-1, 0; i >= j; i-- { @@ -20,7 +20,5 @@ func (sequence BioSequence) ReverseComplement(inplace bool) BioSequence { j++ } - sequence.sequence.id.WriteString("_revcomp") - return sequence } diff --git a/pkg/obiseq/subseq.go b/pkg/obiseq/subseq.go index 2255920..9571b1d 100644 --- a/pkg/obiseq/subseq.go +++ b/pkg/obiseq/subseq.go @@ -3,8 +3,6 @@ package obiseq import ( "errors" "fmt" - - "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" ) // Returns a sub sequence start from position 'from' included, @@ -23,21 +21,22 @@ func (sequence BioSequence) Subsequence(from, to int, circular bool) (BioSequenc return NilBioSequence, errors.New("to out of bounds") } - var new_seq BioSequence + var newSeq BioSequence if from < to { - new_seq = MakeEmptyBioSequence() - new_seq.Write(sequence.Sequence()[from:to]) - fmt.Fprintf(&new_seq.sequence.id, "%s_sub[%d..%d]", sequence.Id(), from+1, to) - new_seq.sequence.definition.Write(sequence.sequence.definition.Bytes()) + newSeq = MakeEmptyBioSequence() + newSeq.Write(sequence.Sequence()[from:to]) + + newSeq.sequence.id = fmt.Sprintf("%s_sub[%d..%d]", sequence.Id(), from+1, to) + newSeq.sequence.definition = sequence.sequence.definition } else { - new_seq, _ = sequence.Subsequence(from, sequence.Length(), false) - new_seq.Write(sequence.Sequence()[0:to]) + newSeq, _ = sequence.Subsequence(from, sequence.Length(), false) + newSeq.Write(sequence.Sequence()[0:to]) } if len(sequence.Annotations()) > 0 { - goutils.CopyMap(new_seq.Annotations(), sequence.Annotations()) + newSeq.sequence.annotations = GetAnnotation(sequence.Annotations()) } - return new_seq, nil + return newSeq, nil } diff --git a/pkg/obitools/obipairing/pairing.go b/pkg/obitools/obipairing/pairing.go index 77ee643..8c3219c 100644 --- a/pkg/obitools/obipairing/pairing.go +++ b/pkg/obitools/obipairing/pairing.go @@ -123,7 +123,7 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence, func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch, gap, delta, overlapMin int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch { - nworkers := runtime.NumCPU() - 1 + nworkers := runtime.NumCPU() * 3 / 2 buffsize := iterator.BufferSize() if len(sizes) > 0 { @@ -185,7 +185,7 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch, newIter.Done() } - log.Printf("Start of the sequence Pairing") + log.Printf("Start of the sequence Pairing using %d workers\n", nworkers) for i := 0; i < nworkers-1; i++ { go f(iterator.Split(), i)