before big changes

This commit is contained in:
2022-02-18 22:53:09 +01:00
parent 37ce3536e1
commit 9737f97084
15 changed files with 234 additions and 91 deletions

View File

@ -1,7 +1,9 @@
package main package main
import ( import (
"log"
"os" "os"
"runtime/pprof"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
@ -11,12 +13,12 @@ import (
func main() { func main() {
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof // go tool pprof -http=":8000" ./obipairing ./cpu.pprof
// f, err := os.Create("cpu.pprof") f, err := os.Create("cpu.pprof")
// if err != nil { if err != nil {
// log.Fatal(err) log.Fatal(err)
// } }
// pprof.StartCPUProfile(f) pprof.StartCPUProfile(f)
// defer pprof.StopCPUProfile() defer pprof.StopCPUProfile()
// go tool trace cpu.trace // go tool trace cpu.trace
// ftrace, err := os.Create("cpu.trace") // ftrace, err := os.Create("cpu.trace")

View File

@ -34,7 +34,7 @@ func find(root, ext string) []string {
} }
func ISequenceChunkOnDisk(iterator obiseq.IBioSequenceBatch, func ISequenceChunkOnDisk(iterator obiseq.IBioSequenceBatch,
classifier obiseq.BioSequenceClassifier, classifier *obiseq.BioSequenceClassifier,
sizes ...int) (obiseq.IBioSequenceBatch, error) { sizes ...int) (obiseq.IBioSequenceBatch, error) {
dir, err := tempDir() dir, err := tempDir()
if err != nil { if err != nil {
@ -78,7 +78,7 @@ func ISequenceChunkOnDisk(iterator obiseq.IBioSequenceBatch,
panic(err) panic(err)
} }
chunck := make(obiseq.BioSequenceSlice, 0, 1000) chunck := make(obiseq.BioSequenceSlice, 0, 10000)
for iseq.Next() { for iseq.Next() {
b := iseq.Get() b := iseq.Get()

View File

@ -8,7 +8,7 @@ import (
) )
func ISequenceChunk(iterator obiseq.IBioSequenceBatch, func ISequenceChunk(iterator obiseq.IBioSequenceBatch,
classifier obiseq.BioSequenceClassifier, classifier *obiseq.BioSequenceClassifier,
sizes ...int) (obiseq.IBioSequenceBatch, error) { sizes ...int) (obiseq.IBioSequenceBatch, error) {
bufferSize := iterator.BufferSize() bufferSize := iterator.BufferSize()
@ -32,27 +32,28 @@ func ISequenceChunk(iterator obiseq.IBioSequenceBatch,
dispatcher := iterator.Distribute(classifier) dispatcher := iterator.Distribute(classifier)
jobDone := sync.WaitGroup{} jobDone := sync.WaitGroup{}
chunks := make(map[string]*obiseq.BioSequenceSlice, 100) chunks := make(map[int]*obiseq.BioSequenceSlice, 1000)
for newflux := range dispatcher.News() { for newflux := range dispatcher.News() {
jobDone.Add(1) jobDone.Add(1)
go func(newflux string) { go func(newflux int) {
data, err := dispatcher.Outputs(newflux) data, err := dispatcher.Outputs(newflux)
if err != nil { if err != nil {
log.Fatalf("Cannot retreive the new chanel : %v", err) log.Fatalf("Cannot retreive the new chanel : %v", err)
} }
chunk := make(obiseq.BioSequenceSlice, 0, 1000) chunk := obiseq.GetBioSequenceSlicePtr()
lock.Lock()
chunks[newflux] = chunk
lock.Unlock()
for data.Next() { for data.Next() {
b := data.Get() b := data.Get()
chunk = append(chunk, b.Slice()...) *chunk = append(*chunk, b.Slice()...)
b.Recycle()
} }
lock.Lock()
chunks[newflux] = &chunk
lock.Unlock()
jobDone.Done() jobDone.Done()
}(newflux) }(newflux)
} }

View File

@ -7,7 +7,7 @@ import (
) )
func ISequenceSubChunk(iterator obiseq.IBioSequenceBatch, func ISequenceSubChunk(iterator obiseq.IBioSequenceBatch,
classifier obiseq.BioSequenceClassifier, classifier *obiseq.BioSequenceClassifier,
sizes ...int) (obiseq.IBioSequenceBatch, error) { sizes ...int) (obiseq.IBioSequenceBatch, error) {
bufferSize := iterator.BufferSize() bufferSize := iterator.BufferSize()
@ -42,33 +42,31 @@ func ISequenceSubChunk(iterator obiseq.IBioSequenceBatch,
} }
ff := func(iterator obiseq.IBioSequenceBatch) { ff := func(iterator obiseq.IBioSequenceBatch) {
chunks := make(map[string]*obiseq.BioSequenceSlice, 100) chunks := make(map[int]*obiseq.BioSequenceSlice, 100)
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
for _, s := range batch.Slice() { for _, s := range batch.Slice() {
key := classifier(s) key := classifier.Code(s)
slice, ok := chunks[key] slice, ok := chunks[key]
if !ok { if !ok {
is := make(obiseq.BioSequenceSlice, 0, len(batch.Slice())) slice = obiseq.GetBioSequenceSlicePtr()
slice = &is
chunks[key] = slice chunks[key] = slice
} }
*slice = append(*slice, s) *slice = append(*slice, s)
} }
n := 0
for k, chunck := range chunks { for k, chunck := range chunks {
n += len(*chunck)
newIter.Channel() <- obiseq.MakeBioSequenceBatch(nextOrder(), *chunck...) newIter.Channel() <- obiseq.MakeBioSequenceBatch(nextOrder(), *chunck...)
delete(chunks, k) delete(chunks, k)
} }
batch.Recycle()
} }
newIter.Done() newIter.Done()

View File

@ -34,6 +34,7 @@ func IUniqueSequence(iterator obiseq.IBioSequenceBatch,
} }
nworkers := opts.ParallelWorkers() nworkers := opts.ParallelWorkers()
iUnique.Add(nworkers) iUnique.Add(nworkers)
go func() { go func() {
@ -52,17 +53,26 @@ func IUniqueSequence(iterator obiseq.IBioSequenceBatch,
return neworder return neworder
} }
var ff func(obiseq.IBioSequenceBatch, obiseq.BioSequenceClassifier, int) var ff func(obiseq.IBioSequenceBatch, *obiseq.BioSequenceClassifier, int)
cat := opts.Categories() cat := opts.Categories()
na := opts.NAValue() na := opts.NAValue()
// ff = func(input obiseq.IBioSequenceBatch,
// classifier obiseq.BioSequenceClassifier,
// icat int) {
// log.Println(na, nextOrder)
// input.Recycle()
// iUnique.Done()
// }
ff = func(input obiseq.IBioSequenceBatch, ff = func(input obiseq.IBioSequenceBatch,
classifier obiseq.BioSequenceClassifier, classifier *obiseq.BioSequenceClassifier,
icat int) { icat int) {
icat-- icat--
input, err = ISequenceSubChunk(input, input, err = ISequenceSubChunk(input,
classifier, classifier,
1,
opts.BufferSize()) opts.BufferSize())
var next obiseq.IBioSequenceBatch var next obiseq.IBioSequenceBatch

View File

@ -23,7 +23,7 @@ func WriterDispatcher(prototypename string,
go func() { go func() {
for newflux := range dispatcher.News() { for newflux := range dispatcher.News() {
jobDone.Add(1) jobDone.Add(1)
go func(newflux string) { go func(newflux int) {
data, err := dispatcher.Outputs(newflux) data, err := dispatcher.Outputs(newflux)
if err != nil { if err != nil {
@ -35,7 +35,7 @@ func WriterDispatcher(prototypename string,
options...) options...)
if err != nil { if err != nil {
log.Fatalf("cannot open the output file for key %s", newflux) log.Fatalf("cannot open the output file for key %d", newflux)
} }
out.Recycle() out.Recycle()

View File

@ -24,7 +24,7 @@ func _FastseqReader(seqfile C.fast_kseq_p,
i := 0 i := 0
ii := 0 ii := 0
slice := make(obiseq.BioSequenceSlice, 0, batch_size) slice := obiseq.GetBioSequenceSlice()
for l := int64(C.next_fast_sek(seqfile)); l > 0; l = int64(C.next_fast_sek(seqfile)) { for l := int64(C.next_fast_sek(seqfile)); l > 0; l = int64(C.next_fast_sek(seqfile)) {

View File

@ -43,6 +43,11 @@ func (batch BioSequenceBatch) IsNil() bool {
return batch.slice == nil return batch.slice == nil
} }
// Recycle hands the sequence slice of the batch back to the slice pool.
//
// The receiver is a copy of the caller's batch, so the caller's own value is
// left untouched by this call.
func (batch BioSequenceBatch) Recycle() {
	// The slice is recycled through the address of batch.slice. Resetting
	// batch.slice afterwards would overwrite the very slice header that has
	// just been placed in the pool and throw the recycled buffer away, so the
	// field is deliberately left alone.
	batch.slice.Recycle()
}
// Structure implementing an iterator over bioseq.BioSequenceBatch // Structure implementing an iterator over bioseq.BioSequenceBatch
// based on a channel. // based on a channel.
type _IBioSequenceBatch struct { type _IBioSequenceBatch struct {
@ -343,7 +348,7 @@ func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBa
go func() { go func() {
order := 0 order := 0
iterator = iterator.SortBatches() iterator = iterator.SortBatches()
buffer := make(BioSequenceSlice, 0, size) buffer := GetBioSequenceSlice()
for iterator.Next() { for iterator.Next() {
seqs := iterator.Get() seqs := iterator.Get()
@ -352,9 +357,10 @@ func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBa
if len(buffer) == size { if len(buffer) == size {
newIter.Channel() <- MakeBioSequenceBatch(order, buffer...) newIter.Channel() <- MakeBioSequenceBatch(order, buffer...)
order++ order++
buffer = make(BioSequenceSlice, 0, size) buffer = GetBioSequenceSlice()
} }
} }
seqs.Recycle()
} }
if len(buffer) > 0 { if len(buffer) > 0 {
@ -449,8 +455,8 @@ func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
falseOrder := 0 falseOrder := 0
iterator = iterator.SortBatches() iterator = iterator.SortBatches()
trueSlice := make(BioSequenceSlice, 0, size) trueSlice := GetBioSequenceSlice()
falseSlice := make(BioSequenceSlice, 0, size) falseSlice := GetBioSequenceSlice()
for iterator.Next() { for iterator.Next() {
seqs := iterator.Get() seqs := iterator.Get()
@ -464,15 +470,16 @@ func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
if len(trueSlice) == size { if len(trueSlice) == size {
trueIter.Channel() <- MakeBioSequenceBatch(trueOrder, trueSlice...) trueIter.Channel() <- MakeBioSequenceBatch(trueOrder, trueSlice...)
trueOrder++ trueOrder++
trueSlice = make(BioSequenceSlice, 0, size) trueSlice = GetBioSequenceSlice()
} }
if len(falseSlice) == size { if len(falseSlice) == size {
falseIter.Channel() <- MakeBioSequenceBatch(falseOrder, falseSlice...) falseIter.Channel() <- MakeBioSequenceBatch(falseOrder, falseSlice...)
falseOrder++ falseOrder++
falseSlice = make(BioSequenceSlice, 0, size) falseSlice = GetBioSequenceSlice()
} }
} }
seqs.Recycle()
} }
if len(trueSlice) > 0 { if len(trueSlice) > 0 {

View File

@ -62,11 +62,11 @@ func (sequence *BioSequence) Recycle() {
pseq := sequence.sequence pseq := sequence.sequence
if pseq != nil { if pseq != nil {
RecycleSlice(pseq.sequence) RecycleSlice(&pseq.sequence)
RecycleSlice(pseq.feature) RecycleSlice(&pseq.feature)
RecycleSlice(pseq.qualities) RecycleSlice(&pseq.qualities)
RecycleAnnotation(pseq.annotations) RecycleAnnotation(&pseq.annotations)
} }
sequence.sequence = nil sequence.sequence = nil
@ -187,21 +187,21 @@ func (s BioSequence) SetDefinition(definition string) {
func (s BioSequence) SetFeatures(feature []byte) { func (s BioSequence) SetFeatures(feature []byte) {
if cap(s.sequence.feature) >= 300 { if cap(s.sequence.feature) >= 300 {
RecycleSlice(s.sequence.feature) RecycleSlice(&s.sequence.feature)
} }
s.sequence.feature = feature s.sequence.feature = feature
} }
func (s BioSequence) SetSequence(sequence []byte) { func (s BioSequence) SetSequence(sequence []byte) {
if s.sequence.sequence != nil { if s.sequence.sequence != nil {
RecycleSlice(s.sequence.sequence) RecycleSlice(&s.sequence.sequence)
} }
s.sequence.sequence = sequence s.sequence.sequence = sequence
} }
func (s BioSequence) SetQualities(qualities Quality) { func (s BioSequence) SetQualities(qualities Quality) {
if s.sequence.qualities != nil { if s.sequence.qualities != nil {
RecycleSlice(s.sequence.qualities) RecycleSlice(&s.sequence.qualities)
} }
s.sequence.qualities = qualities s.sequence.qualities = qualities
} }

View File

@ -3,71 +3,163 @@ package obiseq
import ( import (
"fmt" "fmt"
"hash/crc32" "hash/crc32"
"log"
"strconv" "strconv"
"sync"
) )
type BioSequenceClassifier func(sequence BioSequence) string type BioSequenceClassifier struct {
Code func(BioSequence) int
Value func(int) string
}
func AnnotationClassifier(key string, na string) BioSequenceClassifier { //type BioSequenceClassifier func(sequence BioSequence) string
f := func(sequence BioSequence) string {
func AnnotationClassifier(key string, na string) *BioSequenceClassifier {
encode := make(map[string]int, 1000)
decode := make([]string, 0, 1000)
locke := sync.RWMutex{}
maxcode := 0
code := func(sequence BioSequence) int {
var val string
if sequence.HasAnnotation() { if sequence.HasAnnotation() {
value, ok := sequence.Annotations()[key] value, ok := sequence.Annotations()[key]
if ok { if ok {
switch value := value.(type) { switch value := value.(type) {
case string: case string:
return value val = value
default: default:
return fmt.Sprint(value) val = fmt.Sprint(value)
} }
} }
} }
return na val = na
locke.Lock()
defer locke.Unlock()
k, ok := encode[val]
if !ok {
k = maxcode
maxcode++
encode[val] = k
decode = append(decode, val)
}
return k
} }
return f value := func(k int) string {
locke.RLock()
defer locke.RUnlock()
if k >= maxcode {
log.Fatalf("value %d not register")
}
return decode[k]
}
c := BioSequenceClassifier{code, value}
return &c
} }
func PredicateClassifier(predicate SequencePredicate) BioSequenceClassifier { func PredicateClassifier(predicate SequencePredicate) *BioSequenceClassifier {
f := func(sequence BioSequence) string { code := func(sequence BioSequence) int {
if predicate(sequence) { if predicate(sequence) {
return "true" return 1
} else { } else {
return "false" return 0
} }
} }
return f value := func(k int) string {
if k == 0 {
return "false"
} else {
return "true"
}
}
c := BioSequenceClassifier{code, value}
return &c
} }
// Builds a classifier function based on CRC32 of the sequence // Builds a classifier function based on CRC32 of the sequence
// //
func HashClassifier(size int) BioSequenceClassifier { func HashClassifier(size int) *BioSequenceClassifier {
f := func(sequence BioSequence) string { code := func(sequence BioSequence) int {
h := crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size) return int(crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size))
return strconv.Itoa(int(h))
} }
return f value := func(k int) string {
return strconv.Itoa(k)
}
c := BioSequenceClassifier{code, value}
return &c
} }
// Builds a classifier function based on the sequence // Builds a classifier function based on the sequence
// //
func SequenceClassifier() BioSequenceClassifier { func SequenceClassifier() *BioSequenceClassifier {
f := func(sequence BioSequence) string { encode := make(map[string]int, 1000)
return sequence.String() decode := make([]string, 0, 1000)
locke := sync.RWMutex{}
maxcode := 0
code := func(sequence BioSequence) int {
val := sequence.String()
locke.Lock()
defer locke.Unlock()
k, ok := encode[val]
if !ok {
k = maxcode
maxcode++
encode[val] = k
decode = append(decode, val)
}
return k
} }
return f value := func(k int) string {
locke.RLock()
defer locke.RUnlock()
if k >= maxcode {
log.Fatalf("value %d not register")
}
return decode[k]
}
c := BioSequenceClassifier{code, value}
return &c
} }
func RotateClassifier(size int) BioSequenceClassifier { func RotateClassifier(size int) *BioSequenceClassifier {
n := 0 n := 0
f := func(sequence BioSequence) string { lock := sync.Mutex{}
h := n % size
code := func(sequence BioSequence) int {
lock.Lock()
defer lock.Unlock()
n = n % size
n++ n++
return strconv.Itoa(int(h)) return n
} }
return f value := func(k int) string {
return strconv.Itoa(k)
}
c := BioSequenceClassifier{code, value}
return &c
} }

View File

@ -6,35 +6,35 @@ import (
) )
type IDistribute struct { type IDistribute struct {
outputs map[string]IBioSequenceBatch outputs map[int]IBioSequenceBatch
news chan string news chan int
lock *sync.Mutex lock *sync.Mutex
} }
func (dist *IDistribute) Outputs(key string) (IBioSequenceBatch, error) { func (dist *IDistribute) Outputs(key int) (IBioSequenceBatch, error) {
dist.lock.Lock() dist.lock.Lock()
iter, ok := dist.outputs[key] iter, ok := dist.outputs[key]
dist.lock.Unlock() dist.lock.Unlock()
if !ok { if !ok {
return NilIBioSequenceBatch, fmt.Errorf("key %s unknown", key) return NilIBioSequenceBatch, fmt.Errorf("code %d unknown", key)
} }
return iter, nil return iter, nil
} }
func (dist *IDistribute) News() chan string { func (dist *IDistribute) News() chan int {
return dist.news return dist.news
} }
func (iterator IBioSequenceBatch) Distribute(class BioSequenceClassifier, sizes ...int) IDistribute { func (iterator IBioSequenceBatch) Distribute(class *BioSequenceClassifier, sizes ...int) IDistribute {
batchsize := 5000 batchsize := 5000
buffsize := 2 buffsize := 2
outputs := make(map[string]IBioSequenceBatch, 100) outputs := make(map[int]IBioSequenceBatch, 100)
slices := make(map[string]*BioSequenceSlice, 100) slices := make(map[int]*BioSequenceSlice, 100)
orders := make(map[string]int, 100) orders := make(map[int]int, 100)
news := make(chan string) news := make(chan int)
if len(sizes) > 0 { if len(sizes) > 0 {
batchsize = sizes[0] batchsize = sizes[0]
@ -63,11 +63,11 @@ func (iterator IBioSequenceBatch) Distribute(class BioSequenceClassifier, sizes
for iterator.Next() { for iterator.Next() {
seqs := iterator.Get() seqs := iterator.Get()
for _, s := range seqs.Slice() { for _, s := range seqs.Slice() {
key := class(s) key := class.Code(s)
slice, ok := slices[key] slice, ok := slices[key]
if !ok { if !ok {
s := make(BioSequenceSlice, 0, batchsize) s := GetBioSequenceSlice()
slice = &s slice = &s
slices[key] = slice slices[key] = slice
orders[key] = 0 orders[key] = 0
@ -84,10 +84,11 @@ func (iterator IBioSequenceBatch) Distribute(class BioSequenceClassifier, sizes
if len(*slice) == batchsize { if len(*slice) == batchsize {
outputs[key].Channel() <- MakeBioSequenceBatch(orders[key], *slice...) outputs[key].Channel() <- MakeBioSequenceBatch(orders[key], *slice...)
orders[key]++ orders[key]++
s := make(BioSequenceSlice, 0, batchsize) s := GetBioSequenceSlice()
slices[key] = &s slices[key] = &s
} }
} }
seqs.Recycle()
} }
for key, slice := range slices { for key, slice := range slices {

View File

@ -166,7 +166,7 @@ func (iterator IBioSequence) IBioSequenceBatch(sizes ...int) IBioSequenceBatch {
go func() { go func() {
for j := 0; !iterator.Finished(); j++ { for j := 0; !iterator.Finished(); j++ {
batch := BioSequenceBatch{ batch := BioSequenceBatch{
slice: make(BioSequenceSlice, 0, batchsize), slice: GetBioSequenceSlice(),
order: j} order: j}
for i := 0; i < batchsize && iterator.Next(); i++ { for i := 0; i < batchsize && iterator.Next(); i++ {
seq := iterator.Get() seq := iterator.Get()
@ -280,7 +280,7 @@ func (iterator IBioSequence) Tail(n int, sizes ...int) IBioSequence {
} }
newIter := MakeIBioSequence(buffsize) newIter := MakeIBioSequence(buffsize)
buffseq := make(BioSequenceSlice, n) buffseq := GetBioSequenceSlice()
newIter.Add(1) newIter.Add(1)

View File

@ -13,9 +13,9 @@ var _BioSequenceByteSlicePool = sync.Pool{
}, },
} }
func RecycleSlice(s []byte) { func RecycleSlice(s *[]byte) {
s0 := s[:0] *s = (*s)[:0]
_BioSequenceByteSlicePool.Put(&s0) _BioSequenceByteSlicePool.Put(s)
} }
func GetSlice(values ...byte) []byte { func GetSlice(values ...byte) []byte {
@ -35,10 +35,10 @@ var BioSequenceAnnotationPool = sync.Pool{
}, },
} }
func RecycleAnnotation(a Annotation) { func RecycleAnnotation(a *Annotation) {
if a != nil { if a != nil {
for k := range a { for k := range *a {
delete(a, k) delete(*a, k)
} }
BioSequenceAnnotationPool.Put(&(a)) BioSequenceAnnotationPool.Put(&(a))
} }
@ -54,6 +54,32 @@ func GetAnnotation(values ...Annotation) Annotation {
return a return a
} }
// _BioSequenceSlicePool holds reusable *BioSequenceSlice values. A slice
// created by the pool is empty and has room for 5000 sequences.
var _BioSequenceSlicePool = sync.Pool{
	New: func() interface{} {
		fresh := make(BioSequenceSlice, 0, 5000)
		return &fresh
	},
}
// Recycle truncates the slice to length zero and places the receiver pointer
// in the slice pool.
func (s *BioSequenceSlice) Recycle() {
	emptied := (*s)[:0]
	*s = emptied
	_BioSequenceSlicePool.Put(s)
}
// GetBioSequenceSlicePtr takes a slice from the slice pool, appends values to
// it and returns the pointer obtained from the pool.
func GetBioSequenceSlicePtr(values ...BioSequence) *BioSequenceSlice {
	pooled := _BioSequenceSlicePool.Get().(*BioSequenceSlice)
	// Appending an empty argument list leaves the slice unchanged.
	*pooled = append(*pooled, values...)
	return pooled
}
// GetBioSequenceSlice is the by-value variant of GetBioSequenceSlicePtr: it
// returns the slice the pooled pointer refers to.
func GetBioSequenceSlice(values ...BioSequence) BioSequenceSlice {
	pooled := GetBioSequenceSlicePtr(values...)
	return *pooled
}
// var __bioseq__pool__ = sync.Pool{ // var __bioseq__pool__ = sync.Pool{
// New: func() interface{} { // New: func() interface{} {
// var bs _BioSequence // var bs _BioSequence

6
pkg/obiseq/speed.go Normal file
View File

@ -0,0 +1,6 @@
package obiseq
// speed returns a newly made batch iterator. The receiver is not read.
// NOTE(review): this looks like a placeholder for a later implementation —
// confirm.
func (iterator IBioSequenceBatch) speed() IBioSequenceBatch {
	return MakeIBioSequenceBatch()
}

View File

@ -48,7 +48,7 @@ func OptionSet(options *getoptions.GetOpt) {
DistributeOptionSet(options) DistributeOptionSet(options)
} }
func CLISequenceClassifier() obiseq.BioSequenceClassifier { func CLISequenceClassifier() *obiseq.BioSequenceClassifier {
switch { switch {
case _SequenceClassifierTag != "": case _SequenceClassifierTag != "":
return obiseq.AnnotationClassifier(_SequenceClassifierTag, _NAValue) return obiseq.AnnotationClassifier(_SequenceClassifierTag, _NAValue)