Big change in the data model, and a first version of obiuniq

This commit is contained in:
2022-02-21 19:00:23 +01:00
parent 9737f97084
commit 2e7c1834b0
43 changed files with 664 additions and 440 deletions

View File

@@ -5,6 +5,7 @@ import (
"log"
"sync"
"sync/atomic"
"time"
"github.com/tevino/abool/v2"
)
@@ -16,7 +17,7 @@ type BioSequenceBatch struct {
var NilBioSequenceBatch = BioSequenceBatch{nil, -1}
func MakeBioSequenceBatch(order int, sequences ...BioSequence) BioSequenceBatch {
func MakeBioSequenceBatch(order int, sequences BioSequenceSlice) BioSequenceBatch {
return BioSequenceBatch{
slice: sequences,
order: order,
@@ -39,6 +40,15 @@ func (batch BioSequenceBatch) Slice() BioSequenceSlice {
// Length returns the number of sequences currently stored in the batch.
func (batch BioSequenceBatch) Length() int {
	count := len(batch.slice)
	return count
}
// NotEmpty reports whether the batch still holds at least one sequence.
func (batch BioSequenceBatch) NotEmpty() bool {
	return len(batch.slice) > 0
}
// Pop0 removes the first sequence of the batch and returns it.
//
// The receiver is a pointer on purpose: the slice-level Pop0 re-slices the
// slice header, and with a value receiver that update would only be applied
// to a private copy of the batch. The caller's batch would then never
// shrink, and a loop such as `for batch.NotEmpty() { batch.Pop0() }` would
// never terminate. Calls made on an addressable batch variable compile
// unchanged.
func (batch *BioSequenceBatch) Pop0() *BioSequence {
	return batch.slice.Pop0()
}
// IsNil reports whether the batch has no underlying slice at all. A batch
// holding an allocated but zero-length slice is not nil.
func (batch BioSequenceBatch) IsNil() bool {
	allocated := batch.slice != nil
	return !allocated
}
@@ -201,6 +211,30 @@ func (iterator IBioSequenceBatch) Get() BioSequenceBatch {
return iterator.pointer.current
}
// Push sends batch on the channel of the iterator.
//
// Pushing a nil batch or a batch of length zero is treated as a programming
// error: the problem is logged and the call panics (log.Panicln) before
// anything is written to the channel.
func (iterator IBioSequenceBatch) Push(batch BioSequenceBatch) {
	switch {
	case batch.IsNil():
		log.Panicln("An Nil batch is pushed on the channel")
	case batch.Length() == 0:
		log.Panicln("An empty batch is pushed on the channel")
	}

	iterator.pointer.channel <- batch
}
// Close closes the channel of the iterator.
func (iterator IBioSequenceBatch) Close() {
	out := iterator.pointer.channel
	close(out)
}
// WaitAndClose blocks on Wait, then polls the channel once per millisecond
// until it holds no buffered batch any more, and finally closes it.
func (iterator IBioSequenceBatch) WaitAndClose() {
	iterator.Wait()

	for {
		if len(iterator.Channel()) == 0 {
			break
		}
		time.Sleep(time.Millisecond)
	}

	iterator.Close()
}
// Finished returns 'true' value if no more data is available
// from the iterator.
func (iterator IBioSequenceBatch) Finished() bool {
@@ -227,9 +261,10 @@ func (iterator IBioSequenceBatch) IBioSequence(sizes ...int) IBioSequence {
for iterator.Next() {
batch := iterator.Get()
for _, s := range batch.slice {
newIter.pointer.channel <- s
for batch.NotEmpty() {
newIter.pointer.channel <- batch.Pop0()
}
batch.Recycle()
}
newIter.Done()
}()
@@ -304,7 +339,7 @@ func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSeq
if s.order > max_order {
max_order = s.order
}
newIter.Channel() <- s.Reorder(s.order + previous_max)
newIter.Push(s.Reorder(s.order + previous_max))
}
previous_max = max_order + 1
@@ -315,7 +350,7 @@ func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSeq
max_order = s.order + previous_max
}
newIter.Channel() <- s.Reorder(s.order + previous_max)
newIter.Push(s.Reorder(s.order + previous_max))
}
previous_max = max_order + 1
}
@@ -348,23 +383,23 @@ func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBa
go func() {
order := 0
iterator = iterator.SortBatches()
buffer := GetBioSequenceSlice()
buffer := MakeBioSequenceSlice()
for iterator.Next() {
seqs := iterator.Get()
for _, s := range seqs.slice {
buffer = append(buffer, s)
if len(buffer) == size {
newIter.Channel() <- MakeBioSequenceBatch(order, buffer...)
newIter.Push(MakeBioSequenceBatch(order, buffer))
order++
buffer = GetBioSequenceSlice()
buffer = MakeBioSequenceSlice()
}
}
seqs.Recycle()
}
if len(buffer) > 0 {
newIter.Channel() <- MakeBioSequenceBatch(order, buffer...)
newIter.Push(MakeBioSequenceBatch(order, buffer))
}
newIter.Done()
@@ -377,15 +412,17 @@ func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBa
func (iterator IBioSequenceBatch) Recycle() {
log.Println("Start recycling of Bioseq objects")
recycled := 0
for iterator.Next() {
// iterator.Get()
batch := iterator.Get()
for _, seq := range batch.Slice() {
(&seq).Recycle()
seq.Recycle()
recycled++
}
batch.Recycle()
}
log.Println("End of the recycling of Bioseq objects")
log.Printf("End of the recycling of %d Bioseq objects", recycled)
}
func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...int) IPairedBioSequenceBatch {
@@ -444,10 +481,8 @@ func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
falseIter.Add(1)
go func() {
trueIter.Wait()
falseIter.Wait()
close(trueIter.Channel())
close(falseIter.Channel())
trueIter.WaitAndClose()
falseIter.WaitAndClose()
}()
go func() {
@@ -455,8 +490,8 @@ func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
falseOrder := 0
iterator = iterator.SortBatches()
trueSlice := GetBioSequenceSlice()
falseSlice := GetBioSequenceSlice()
trueSlice := MakeBioSequenceSlice()
falseSlice := MakeBioSequenceSlice()
for iterator.Next() {
seqs := iterator.Get()
@@ -468,26 +503,26 @@ func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
}
if len(trueSlice) == size {
trueIter.Channel() <- MakeBioSequenceBatch(trueOrder, trueSlice...)
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice))
trueOrder++
trueSlice = GetBioSequenceSlice()
trueSlice = MakeBioSequenceSlice()
}
if len(falseSlice) == size {
falseIter.Channel() <- MakeBioSequenceBatch(falseOrder, falseSlice...)
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice))
falseOrder++
falseSlice = GetBioSequenceSlice()
falseSlice = MakeBioSequenceSlice()
}
}
seqs.Recycle()
}
if len(trueSlice) > 0 {
trueIter.Channel() <- MakeBioSequenceBatch(trueOrder, trueSlice...)
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice))
}
if len(falseSlice) > 0 {
falseIter.Channel() <- MakeBioSequenceBatch(falseOrder, falseSlice...)
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice))
}
trueIter.Done()

View File

@@ -2,10 +2,22 @@ package obiseq
import (
"crypto/md5"
"log"
"sync/atomic"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
)
// Counters describing the life cycle of the BioSequence instances.
// They are updated with the sync/atomic primitives and therefore have to be
// read through sync/atomic as well.
var (
	// _NewSeq is incremented by MakeEmptyBioSequence.
	_NewSeq = int32(0)
	// _RecycleSeq is incremented by (*BioSequence).Recycle.
	_RecycleSeq = int32(0)
	// _InMemSeq is incremented on creation and decremented on recycling.
	_InMemSeq = int32(0)
	// _MaxInMemSeq is declared for bookkeeping; no visible code updates it.
	_MaxInMemSeq = int32(0)
	// _BioLogRate is only referenced by logging code that is commented out.
	_BioLogRate = int(100000)
)

// LogBioSeqStatus logs the number of created and recycled sequences and the
// number of sequences currently accounted as being in memory.
//
// The counters are loaded atomically: they are modified concurrently with
// atomic.AddInt32, so a plain read would be a data race.
func LogBioSeqStatus() {
	log.Printf("@@@@>>>> Created seq : %d Destroyed : %d In Memory : %d",
		atomic.LoadInt32(&_NewSeq),
		atomic.LoadInt32(&_RecycleSeq),
		atomic.LoadInt32(&_InMemSeq))
}
type Quality []uint8
var __default_qualities__ = make(Quality, 0, 500)
@@ -22,7 +34,7 @@ func __make_default_qualities__(length int) Quality {
type Annotation map[string]interface{}
type _BioSequence struct {
type BioSequence struct {
id string
definition string
sequence []byte
@@ -31,12 +43,17 @@ type _BioSequence struct {
annotations Annotation
}
type BioSequence struct {
sequence *_BioSequence
}
func MakeEmptyBioSequence() BioSequence {
bs := _BioSequence{
atomic.AddInt32(&_NewSeq, 1)
atomic.AddInt32(&_InMemSeq, 1)
//if atomic.CompareAndSwapInt32()()
// if int(_NewSeq)%int(_BioLogRate) == 0 {
// LogBioSeqStatus()
// }
return BioSequence{
id: "",
definition: "",
sequence: nil,
@@ -44,7 +61,11 @@ func MakeEmptyBioSequence() BioSequence {
feature: nil,
annotations: nil,
}
return BioSequence{&bs}
}
// NewEmptyBioSequence builds an empty sequence with MakeEmptyBioSequence
// and returns a pointer to it.
func NewEmptyBioSequence() *BioSequence {
	empty := MakeEmptyBioSequence()
	ptr := &empty
	return ptr
}
func MakeBioSequence(id string,
@@ -57,104 +78,109 @@ func MakeBioSequence(id string,
return bs
}
// NewBioSequence builds a sequence from its identifier, its sequence bytes
// and its definition using MakeBioSequence, and returns a pointer to it.
func NewBioSequence(id string,
	sequence []byte,
	definition string) *BioSequence {
	built := MakeBioSequence(id, sequence, definition)
	return &built
}
func (sequence *BioSequence) Recycle() {
pseq := sequence.sequence
atomic.AddInt32(&_RecycleSeq, 1)
atomic.AddInt32(&_InMemSeq, -1)
if pseq != nil {
RecycleSlice(&pseq.sequence)
RecycleSlice(&pseq.feature)
RecycleSlice(&pseq.qualities)
// if int(_RecycleSeq)%int(_BioLogRate) == 0 {
// LogBioSeqStatus()
// }
RecycleAnnotation(&pseq.annotations)
if sequence != nil {
RecycleSlice(&sequence.sequence)
sequence.sequence = nil
RecycleSlice(&sequence.feature)
sequence.feature = nil
RecycleSlice(&sequence.qualities)
sequence.qualities = nil
RecycleAnnotation(&sequence.annotations)
sequence.annotations = nil
}
sequence.sequence = nil
}
var NilBioSequence = BioSequence{sequence: nil}
func (s BioSequence) IsNil() bool {
return s.sequence == nil
}
func (s BioSequence) Copy() BioSequence {
func (s *BioSequence) Copy() *BioSequence {
newSeq := MakeEmptyBioSequence()
newSeq.sequence.id = s.sequence.id
newSeq.sequence.definition = s.sequence.definition
newSeq.id = s.id
newSeq.definition = s.definition
newSeq.sequence.sequence = GetSlice(s.sequence.sequence...)
newSeq.sequence.qualities = GetSlice(s.sequence.qualities...)
newSeq.sequence.feature = GetSlice(s.sequence.feature...)
newSeq.sequence = GetSlice(s.sequence...)
newSeq.qualities = GetSlice(s.qualities...)
newSeq.feature = GetSlice(s.feature...)
if len(s.sequence.annotations) > 0 {
newSeq.sequence.annotations = GetAnnotation(s.sequence.annotations)
if len(s.annotations) > 0 {
newSeq.annotations = GetAnnotation(s.annotations)
}
return newSeq
return &newSeq
}
func (s BioSequence) Id() string {
return s.sequence.id
func (s *BioSequence) Id() string {
return s.id
}
func (s BioSequence) Definition() string {
return s.sequence.definition
func (s *BioSequence) Definition() string {
return s.definition
}
func (s BioSequence) Sequence() []byte {
return s.sequence.sequence
func (s *BioSequence) Sequence() []byte {
return s.sequence
}
func (s BioSequence) String() string {
return string(s.sequence.sequence)
func (s *BioSequence) String() string {
return string(s.sequence)
}
func (s BioSequence) Length() int {
return len(s.sequence.sequence)
func (s *BioSequence) Length() int {
return len(s.sequence)
}
func (s BioSequence) HasQualities() bool {
return len(s.sequence.qualities) > 0
func (s *BioSequence) HasQualities() bool {
return len(s.qualities) > 0
}
func (s BioSequence) Qualities() Quality {
func (s *BioSequence) Qualities() Quality {
if s.HasQualities() {
return s.sequence.qualities
return s.qualities
} else {
return __make_default_qualities__(len(s.sequence.sequence))
return __make_default_qualities__(len(s.sequence))
}
}
func (s BioSequence) Features() string {
return string(s.sequence.feature)
func (s *BioSequence) Features() string {
return string(s.feature)
}
func (s BioSequence) HasAnnotation() bool {
return len(s.sequence.annotations) > 0
func (s *BioSequence) HasAnnotation() bool {
return len(s.annotations) > 0
}
func (s BioSequence) Annotations() Annotation {
if s.sequence == nil {
return nil
func (s *BioSequence) Annotations() Annotation {
if s.annotations == nil {
s.annotations = GetAnnotation()
}
if s.sequence.annotations == nil {
s.sequence.annotations = GetAnnotation()
}
return s.sequence.annotations
return s.annotations
}
func (s BioSequence) MD5() [16]byte {
return md5.Sum(s.sequence.sequence)
func (s *BioSequence) MD5() [16]byte {
return md5.Sum(s.sequence)
}
func (s BioSequence) Count() int {
if s.sequence.annotations == nil {
func (s *BioSequence) Count() int {
if s.annotations == nil {
return 1
}
if val, ok := (s.sequence.annotations)["count"]; ok {
if val, ok := (s.annotations)["count"]; ok {
val, err := goutils.InterfaceToInt(val)
if err == nil {
return val
@@ -163,12 +189,12 @@ func (s BioSequence) Count() int {
return 1
}
func (s BioSequence) Taxid() int {
if s.sequence.annotations == nil {
func (s *BioSequence) Taxid() int {
if s.annotations == nil {
return 1
}
if val, ok := (s.sequence.annotations)["taxid"]; ok {
if val, ok := (s.annotations)["taxid"]; ok {
val, err := goutils.InterfaceToInt(val)
if err == nil {
return val
@@ -177,56 +203,56 @@ func (s BioSequence) Taxid() int {
return 1
}
func (s BioSequence) SetId(id string) {
s.sequence.id = id
func (s *BioSequence) SetId(id string) {
s.id = id
}
func (s BioSequence) SetDefinition(definition string) {
s.sequence.definition = definition
func (s *BioSequence) SetDefinition(definition string) {
s.definition = definition
}
func (s BioSequence) SetFeatures(feature []byte) {
if cap(s.sequence.feature) >= 300 {
RecycleSlice(&s.sequence.feature)
func (s *BioSequence) SetFeatures(feature []byte) {
if cap(s.feature) >= 300 {
RecycleSlice(&s.feature)
}
s.sequence.feature = feature
s.feature = feature
}
func (s BioSequence) SetSequence(sequence []byte) {
if s.sequence.sequence != nil {
RecycleSlice(&s.sequence.sequence)
func (s *BioSequence) SetSequence(sequence []byte) {
if s.sequence != nil {
RecycleSlice(&s.sequence)
}
s.sequence.sequence = sequence
s.sequence = sequence
}
func (s BioSequence) SetQualities(qualities Quality) {
if s.sequence.qualities != nil {
RecycleSlice(&s.sequence.qualities)
func (s *BioSequence) SetQualities(qualities Quality) {
if s.qualities != nil {
RecycleSlice(&s.qualities)
}
s.sequence.qualities = qualities
s.qualities = qualities
}
func (s BioSequence) WriteQualities(data []byte) (int, error) {
s.sequence.qualities = append(s.sequence.qualities, data...)
func (s *BioSequence) WriteQualities(data []byte) (int, error) {
s.qualities = append(s.qualities, data...)
return len(data), nil
}
func (s BioSequence) WriteByteQualities(data byte) error {
s.sequence.qualities = append(s.sequence.qualities, data)
func (s *BioSequence) WriteByteQualities(data byte) error {
s.qualities = append(s.qualities, data)
return nil
}
func (s BioSequence) Write(data []byte) (int, error) {
s.sequence.sequence = append(s.sequence.sequence, data...)
func (s *BioSequence) Write(data []byte) (int, error) {
s.sequence = append(s.sequence, data...)
return len(data), nil
}
func (s BioSequence) WriteString(data string) (int, error) {
func (s *BioSequence) WriteString(data string) (int, error) {
bdata := []byte(data)
return s.Write(bdata)
}
func (s BioSequence) WriteByte(data byte) error {
s.sequence.sequence = append(s.sequence.sequence, data)
func (s *BioSequence) WriteByte(data byte) error {
s.sequence = append(s.sequence, data)
return nil
}

View File

@@ -1,3 +1,58 @@
package obiseq
type BioSequenceSlice []BioSequence
import (
"sync"
)
type BioSequenceSlice []*BioSequence
// _BioSequenceSlicePool provides pointers to BioSequenceSlice values. A
// freshly created slice is empty and has room for 10 sequences.
var _BioSequenceSlicePool = sync.Pool{
	New: func() interface{} {
		fresh := make(BioSequenceSlice, 0, 10)
		return &fresh
	},
}
// NewBioSequenceSlice returns a pointer to a BioSequenceSlice obtained from
// _BioSequenceSlicePool.
func NewBioSequenceSlice() *BioSequenceSlice {
	item := _BioSequenceSlicePool.Get()
	return item.(*BioSequenceSlice)
}
// MakeBioSequenceSlice returns the slice value designated by the pointer
// that NewBioSequenceSlice provides.
func MakeBioSequenceSlice() BioSequenceSlice {
	pooled := NewBioSequenceSlice()
	return *pooled
}
// Recycle is currently a no-op: the code that cleared the slice and gave it
// back to _BioSequenceSlicePool is kept below but commented out, so calling
// Recycle neither modifies the slice nor touches the pool.
func (s *BioSequenceSlice) Recycle() {
// if s == nil {
// log.Panicln("Trying too recycle a nil pointer")
// }
// // Code added to potentially limit memory leaks
// for i := range *s {
// (*s)[i] = nil
// }
// *s = (*s)[:0]
// _BioSequenceSlicePool.Put(s)
}
// Push appends sequence at the end of the slice.
func (s *BioSequenceSlice) Push(sequence *BioSequence) {
	extended := append(*s, sequence)
	*s = extended
}
// Pop removes the last sequence of the slice and returns it. The vacated
// cell is set to nil before the slice is shortened. Calling Pop on an
// empty slice panics with an index out of range error.
func (s *BioSequenceSlice) Pop() *BioSequence {
	last := len(*s) - 1
	popped := (*s)[last]
	(*s)[last] = nil
	*s = (*s)[:last]
	return popped
}
// Pop0 removes the first sequence of the slice and returns it. The vacated
// cell is set to nil before the slice start is moved forward. Calling Pop0
// on an empty slice panics with an index out of range error.
func (s *BioSequenceSlice) Pop0() *BioSequence {
	items := *s
	head := items[0]
	items[0] = nil
	*s = items[1:]
	return head
}
// NotEmpty reports whether the slice contains at least one element.
func (s BioSequenceSlice) NotEmpty() bool {
	return len(s) != 0
}

View File

@@ -9,19 +9,19 @@ import (
)
type BioSequenceClassifier struct {
Code func(BioSequence) int
Code func(*BioSequence) int
Value func(int) string
Reset func()
Clone func() *BioSequenceClassifier
}
//type BioSequenceClassifier func(sequence BioSequence) string
func AnnotationClassifier(key string, na string) *BioSequenceClassifier {
encode := make(map[string]int, 1000)
decode := make([]string, 0, 1000)
locke := sync.RWMutex{}
maxcode := 0
code := func(sequence BioSequence) int {
code := func(sequence *BioSequence) int {
var val string
if sequence.HasAnnotation() {
value, ok := sequence.Annotations()[key]
@@ -62,12 +62,26 @@ func AnnotationClassifier(key string, na string) *BioSequenceClassifier {
return decode[k]
}
c := BioSequenceClassifier{code, value}
// reset empties the classifier so that it can be reused from scratch.
// The code counter is rewound together with the encode/decode tables, as
// the reset of SequenceClassifier does; leaving maxcode untouched would
// let it drift away from the (now empty) decode table.
// NOTE(review): the part of the code closure that consumes maxcode is not
// visible in this hunk - confirm that new codes are derived from it.
reset := func() {
	locke.Lock()
	defer locke.Unlock()

	for k := range encode {
		delete(encode, k)
	}
	decode = decode[:0]
	maxcode = 0
}
clone := func() *BioSequenceClassifier {
return AnnotationClassifier(key, na)
}
c := BioSequenceClassifier{code, value, reset, clone}
return &c
}
func PredicateClassifier(predicate SequencePredicate) *BioSequenceClassifier {
code := func(sequence BioSequence) int {
code := func(sequence *BioSequence) int {
if predicate(sequence) {
return 1
} else {
@@ -85,14 +99,22 @@ func PredicateClassifier(predicate SequencePredicate) *BioSequenceClassifier {
}
c := BioSequenceClassifier{code, value}
reset := func() {
}
clone := func() *BioSequenceClassifier {
return PredicateClassifier(predicate)
}
c := BioSequenceClassifier{code, value, reset, clone}
return &c
}
// Builds a classifier function based on CRC32 of the sequence
//
func HashClassifier(size int) *BioSequenceClassifier {
code := func(sequence BioSequence) int {
code := func(sequence *BioSequence) int {
return int(crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size))
}
@@ -100,7 +122,15 @@ func HashClassifier(size int) *BioSequenceClassifier {
return strconv.Itoa(k)
}
c := BioSequenceClassifier{code, value}
reset := func() {
}
clone := func() *BioSequenceClassifier {
return HashClassifier(size)
}
c := BioSequenceClassifier{code, value, reset, clone}
return &c
}
@@ -112,7 +142,7 @@ func SequenceClassifier() *BioSequenceClassifier {
locke := sync.RWMutex{}
maxcode := 0
code := func(sequence BioSequence) int {
code := func(sequence *BioSequence) int {
val := sequence.String()
locke.Lock()
@@ -140,7 +170,23 @@ func SequenceClassifier() *BioSequenceClassifier {
return decode[k]
}
c := BioSequenceClassifier{code, value}
reset := func() {
locke.Lock()
defer locke.Unlock()
// for k := range encode {
// delete(encode, k)
// }
encode = make(map[string]int)
decode = decode[:0]
maxcode = 0
}
clone := func() *BioSequenceClassifier {
return SequenceClassifier()
}
c := BioSequenceClassifier{code, value, reset, clone}
return &c
}
@@ -148,7 +194,7 @@ func RotateClassifier(size int) *BioSequenceClassifier {
n := 0
lock := sync.Mutex{}
code := func(sequence BioSequence) int {
code := func(sequence *BioSequence) int {
lock.Lock()
defer lock.Unlock()
n = n % size
@@ -160,6 +206,14 @@ func RotateClassifier(size int) *BioSequenceClassifier {
return strconv.Itoa(k)
}
c := BioSequenceClassifier{code, value}
reset := func() {
}
clone := func() *BioSequenceClassifier {
return RotateClassifier(size)
}
c := BioSequenceClassifier{code, value, reset, clone}
return &c
}

View File

@@ -6,9 +6,10 @@ import (
)
type IDistribute struct {
outputs map[int]IBioSequenceBatch
news chan int
lock *sync.Mutex
outputs map[int]IBioSequenceBatch
news chan int
classifier *BioSequenceClassifier
lock *sync.Mutex
}
func (dist *IDistribute) Outputs(key int) (IBioSequenceBatch, error) {
@@ -27,6 +28,10 @@ func (dist *IDistribute) News() chan int {
return dist.news
}
// Classifier returns the classifier stored in the distribution.
func (dist *IDistribute) Classifier() *BioSequenceClassifier {
	classifier := dist.classifier
	return classifier
}
func (iterator IBioSequenceBatch) Distribute(class *BioSequenceClassifier, sizes ...int) IDistribute {
batchsize := 5000
buffsize := 2
@@ -53,7 +58,7 @@ func (iterator IBioSequenceBatch) Distribute(class *BioSequenceClassifier, sizes
jobDone.Wait()
close(news)
for _, i := range outputs {
close(i.Channel())
i.Close()
}
}()
@@ -67,7 +72,7 @@ func (iterator IBioSequenceBatch) Distribute(class *BioSequenceClassifier, sizes
slice, ok := slices[key]
if !ok {
s := GetBioSequenceSlice()
s := MakeBioSequenceSlice()
slice = &s
slices[key] = slice
orders[key] = 0
@@ -82,9 +87,9 @@ func (iterator IBioSequenceBatch) Distribute(class *BioSequenceClassifier, sizes
*slice = append(*slice, s)
if len(*slice) == batchsize {
outputs[key].Channel() <- MakeBioSequenceBatch(orders[key], *slice...)
outputs[key].Push(MakeBioSequenceBatch(orders[key], *slice))
orders[key]++
s := GetBioSequenceSlice()
s := MakeBioSequenceSlice()
slices[key] = &s
}
}
@@ -93,7 +98,7 @@ func (iterator IBioSequenceBatch) Distribute(class *BioSequenceClassifier, sizes
for key, slice := range slices {
if len(*slice) > 0 {
outputs[key].Channel() <- MakeBioSequenceBatch(orders[key], *slice...)
outputs[key].Push(MakeBioSequenceBatch(orders[key], *slice))
}
}
@@ -104,6 +109,7 @@ func (iterator IBioSequenceBatch) Distribute(class *BioSequenceClassifier, sizes
return IDistribute{
outputs,
news,
class,
&lock}
}

View File

@@ -2,14 +2,13 @@ package obiseq
import (
"sync"
"time"
)
// Private structure implementing an iterator over
// bioseq.BioSequence based on a channel.
type __ibiosequence__ struct {
channel chan BioSequence
current BioSequence
channel chan *BioSequence
current *BioSequence
pushBack bool
all_done *sync.WaitGroup
buffer_size int
@@ -39,10 +38,10 @@ func (iterator IBioSequence) Wait() {
iterator.pointer.all_done.Wait()
}
func (iterator IBioSequence) Channel() chan BioSequence {
func (iterator IBioSequence) Channel() chan *BioSequence {
return iterator.pointer.channel
}
func (iterator IBioSequence) PChannel() *chan BioSequence {
func (iterator IBioSequence) PChannel() *chan *BioSequence {
return &(iterator.pointer.channel)
}
@@ -54,8 +53,8 @@ func MakeIBioSequence(sizes ...int) IBioSequence {
}
i := __ibiosequence__{
channel: make(chan BioSequence, buffsize),
current: NilBioSequence,
channel: make(chan *BioSequence, buffsize),
current: nil,
pushBack: false,
buffer_size: buffsize,
finished: false,
@@ -73,7 +72,7 @@ func (iterator IBioSequence) Split() IBioSequence {
i := __ibiosequence__{
channel: iterator.pointer.channel,
current: NilBioSequence,
current: nil,
pushBack: false,
finished: false,
all_done: iterator.pointer.all_done,
@@ -87,7 +86,7 @@ func (iterator IBioSequence) Split() IBioSequence {
func (iterator IBioSequence) Next() bool {
if iterator.IsNil() || *(iterator.pointer.pFinished) {
iterator.pointer.current = NilBioSequence
iterator.pointer.current = nil
return false
}
@@ -103,13 +102,13 @@ func (iterator IBioSequence) Next() bool {
return true
}
iterator.pointer.current = NilBioSequence
iterator.pointer.current = nil
*iterator.pointer.pFinished = true
return false
}
func (iterator IBioSequence) PushBack() {
if !iterator.pointer.current.IsNil() {
if !(iterator.pointer.current == nil) {
iterator.pointer.pushBack = true
}
}
@@ -118,7 +117,7 @@ func (iterator IBioSequence) PushBack() {
// currently pointed by the iterator. You have to use the
// 'Next' method to move to the next entry before calling
// 'Get' to retreive the following instance.
func (iterator IBioSequence) Get() BioSequence {
func (iterator IBioSequence) Get() *BioSequence {
return iterator.pointer.current
}
@@ -156,17 +155,13 @@ func (iterator IBioSequence) IBioSequenceBatch(sizes ...int) IBioSequenceBatch {
newIter.Add(1)
go func() {
newIter.Wait()
for len(newIter.Channel()) > 0 {
time.Sleep(time.Millisecond)
}
close(newIter.pointer.channel)
newIter.WaitAndClose()
}()
go func() {
for j := 0; !iterator.Finished(); j++ {
batch := BioSequenceBatch{
slice: GetBioSequenceSlice(),
slice: MakeBioSequenceSlice(),
order: j}
for i := 0; i < batchsize && iterator.Next(); i++ {
seq := iterator.Get()
@@ -280,7 +275,7 @@ func (iterator IBioSequence) Tail(n int, sizes ...int) IBioSequence {
}
newIter := MakeIBioSequence(buffsize)
buffseq := GetBioSequenceSlice()
buffseq := MakeBioSequenceSlice()
newIter.Add(1)

View File

@@ -1,6 +1,6 @@
package obiseq
func (sequence BioSequence) Join(seq2 BioSequence, inplace bool) BioSequence {
func (sequence *BioSequence) Join(seq2 *BioSequence, inplace bool) *BioSequence {
if !inplace {
sequence = sequence.Copy()

View File

@@ -8,7 +8,7 @@ import (
type StatsOnValues map[string]int
func (sequence BioSequence) HasStatsOn(key string) bool {
func (sequence *BioSequence) HasStatsOn(key string) bool {
if !sequence.HasAnnotation() {
return false
}
@@ -20,7 +20,7 @@ func (sequence BioSequence) HasStatsOn(key string) bool {
return ok
}
func (sequence BioSequence) StatsOn(key string, na string) StatsOnValues {
func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
mkey := "merged_" + key
annotations := sequence.Annotations()
istat, ok := annotations[mkey]
@@ -51,9 +51,9 @@ func (sequence BioSequence) StatsOn(key string, na string) StatsOnValues {
return stats
}
func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence, na string) bool {
func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool {
sval := na
stats := sequence.StatsOn(key,na)
stats := sequence.StatsOn(key, na)
retval := false
if toAdd.HasAnnotation() {
@@ -97,7 +97,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
return stats
}
func (sequence BioSequence) Merge(tomerge BioSequence, na string, inplace bool, statsOn ...string) BioSequence {
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence {
if !inplace {
sequence = sequence.Copy()
}
@@ -112,11 +112,11 @@ func (sequence BioSequence) Merge(tomerge BioSequence, na string, inplace bool,
for _, key := range statsOn {
if tomerge.HasStatsOn(key) {
smk := sequence.StatsOn(key,na)
mmk := tomerge.StatsOn(key,na)
smk := sequence.StatsOn(key, na)
mmk := tomerge.StatsOn(key, na)
smk.Merge(mmk)
} else {
sequence.StatsPlusOne(key, tomerge,na)
sequence.StatsPlusOne(key, tomerge, na)
}
}
@@ -143,24 +143,63 @@ func (sequence BioSequence) Merge(tomerge BioSequence, na string, inplace bool,
return sequence
}
func (sequences BioSequenceSlice) Merge(na string, statsOn ...string) BioSequenceSlice {
func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence {
seq := sequences[0]
//sequences[0] = nil
seq.SetQualities(nil)
seq.Annotations()["count"] = 1
for _, toMerge := range sequences[1:] {
seq.Merge(toMerge, na, true, statsOn...)
toMerge.Recycle()
if len(sequences) == 1 {
seq.Annotations()["count"] = 1
for _, v := range statsOn {
seq.StatsOn(v, na)
}
} else {
for k, toMerge := range sequences[1:] {
seq.Merge(toMerge, na, true, statsOn...)
toMerge.Recycle()
sequences[1+k] = nil
}
}
return sequences[0:1]
sequences.Recycle()
return seq
}
func MergeSliceWorker(na string, statsOn ...string) SeqSliceWorker {
func (iterator IBioSequenceBatch) IMergeSequenceBatch(na string, statsOn []string, sizes ...int) IBioSequenceBatch {
batchsize := 100
buffsize := iterator.BufferSize()
worker := func(sequences BioSequenceSlice) BioSequenceSlice {
return sequences.Merge(na, statsOn...)
if len(sizes) > 0 {
batchsize = sizes[0]
}
if len(sizes) > 1 {
buffsize = sizes[1]
}
return worker
newIter := MakeIBioSequenceBatch(buffsize)
newIter.Add(1)
go func() {
newIter.WaitAndClose()
}()
go func() {
for j := 0; !iterator.Finished(); j++ {
batch := BioSequenceBatch{
slice: MakeBioSequenceSlice(),
order: j}
for i := 0; i < batchsize && iterator.Next(); i++ {
seqs := iterator.Get()
batch.slice = append(batch.slice, seqs.slice.Merge(na, statsOn))
}
if batch.Length() > 0 {
newIter.Push(batch)
}
}
newIter.Done()
}()
return newIter
}

View File

@@ -14,8 +14,10 @@ var _BioSequenceByteSlicePool = sync.Pool{
}
func RecycleSlice(s *[]byte) {
*s = (*s)[:0]
_BioSequenceByteSlicePool.Put(s)
if s != nil && *s != nil {
*s = (*s)[:0]
_BioSequenceByteSlicePool.Put(s)
}
}
func GetSlice(values ...byte) []byte {
@@ -30,7 +32,7 @@ func GetSlice(values ...byte) []byte {
var BioSequenceAnnotationPool = sync.Pool{
New: func() interface{} {
bs := make(Annotation, 100)
bs := make(Annotation, 5)
return &bs
},
}
@@ -40,12 +42,16 @@ func RecycleAnnotation(a *Annotation) {
for k := range *a {
delete(*a, k)
}
BioSequenceAnnotationPool.Put(&(a))
BioSequenceAnnotationPool.Put(a)
}
}
func GetAnnotation(values ...Annotation) Annotation {
a := *(BioSequenceAnnotationPool.Get().(*Annotation))
a := Annotation(nil)
for a == nil {
a = *(BioSequenceAnnotationPool.Get().(*Annotation))
}
if len(values) > 0 {
goutils.CopyMap(a, values[0])
@@ -53,58 +59,3 @@ func GetAnnotation(values ...Annotation) Annotation {
return a
}
var _BioSequenceSlicePool = sync.Pool{
New: func() interface{} {
bs := make(BioSequenceSlice, 0, 5000)
return &bs
},
}
func (s *BioSequenceSlice) Recycle() {
*s = (*s)[:0]
_BioSequenceSlicePool.Put(s)
}
func GetBioSequenceSlicePtr(values ...BioSequence) *BioSequenceSlice {
s := _BioSequenceSlicePool.Get().(*BioSequenceSlice)
if len(values) > 0 {
*s = append(*s, values...)
}
return s
}
func GetBioSequenceSlice(values ...BioSequence) BioSequenceSlice {
return *GetBioSequenceSlicePtr(values...)
}
// var __bioseq__pool__ = sync.Pool{
// New: func() interface{} {
// var bs _BioSequence
// bs.annotations = make(Annotation, 50)
// return &bs
// },
// }
// func MakeEmptyBioSequence() BioSequence {
// bs := BioSequence{__bioseq__pool__.Get().(*_BioSequence)}
// return bs
// }
// func MakeBioSequence(id string,
// sequence []byte,
// definition string) BioSequence {
// bs := MakeEmptyBioSequence()
// bs.SetId(id)
// bs.Write(sequence)
// bs.SetDefinition(definition)
// return bs
// }
// func (sequence *BioSequence) Recycle() {
// sequence.Reset()
// __bioseq__pool__.Put(sequence.sequence)
// sequence.sequence = nil
// }

View File

@@ -1,9 +1,9 @@
package obiseq
type SequencePredicate func(BioSequence) bool
type SequencePredicate func(*BioSequence) bool
func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence BioSequence) bool {
f := func(sequence *BioSequence) bool {
return predicate1(sequence) && predicate2(sequence)
}
@@ -11,7 +11,7 @@ func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePr
}
func (predicate1 SequencePredicate) Or(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence BioSequence) bool {
f := func(sequence *BioSequence) bool {
return predicate1(sequence) || predicate2(sequence)
}
@@ -19,7 +19,7 @@ func (predicate1 SequencePredicate) Or(predicate2 SequencePredicate) SequencePre
}
func (predicate1 SequencePredicate) Xor(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence BioSequence) bool {
f := func(sequence *BioSequence) bool {
p1 := predicate1(sequence)
p2 := predicate2(sequence)
return (p1 && !p2) || (p2 && !p1)
@@ -29,7 +29,7 @@ func (predicate1 SequencePredicate) Xor(predicate2 SequencePredicate) SequencePr
}
func (predicate1 SequencePredicate) Not() SequencePredicate {
f := func(sequence BioSequence) bool {
f := func(sequence *BioSequence) bool {
return !predicate1(sequence)
}
@@ -38,7 +38,7 @@ func (predicate1 SequencePredicate) Not() SequencePredicate {
func HasAttribute(name string) SequencePredicate {
f := func(sequence BioSequence) bool {
f := func(sequence *BioSequence) bool {
if sequence.HasAnnotation() {
_, ok := (sequence.Annotations())[name]
return ok
@@ -51,7 +51,7 @@ func HasAttribute(name string) SequencePredicate {
}
func MoreAbundantThan(count int) SequencePredicate {
f := func(sequence BioSequence) bool {
f := func(sequence *BioSequence) bool {
return sequence.Count() > count
}
@@ -59,7 +59,7 @@ func MoreAbundantThan(count int) SequencePredicate {
}
func IsLongerOrEqualTo(length int) SequencePredicate {
f := func(sequence BioSequence) bool {
f := func(sequence *BioSequence) bool {
return sequence.Length() >= length
}
@@ -67,7 +67,7 @@ func IsLongerOrEqualTo(length int) SequencePredicate {
}
func IsShorterOrEqualTo(length int) SequencePredicate {
f := func(sequence BioSequence) bool {
f := func(sequence *BioSequence) bool {
return sequence.Length() <= length
}

View File

@@ -5,13 +5,13 @@ var __revcmp_dna__ = []byte(".TVGHEFCDIJMLKNOPQYSAABWXRZ#!][")
// Reverse complements a DNA sequence.
// If the inplace parametter is true, that operation is done in place.
func (sequence BioSequence) ReverseComplement(inplace bool) BioSequence {
func (sequence *BioSequence) ReverseComplement(inplace bool) *BioSequence {
if !inplace {
sequence = sequence.Copy()
}
s := sequence.sequence.sequence
s := sequence.sequence
for i, j := sequence.Length()-1, 0; i >= j; i-- {

View File

@@ -1,6 +1,39 @@
package obiseq
func (iterator IBioSequenceBatch) speed() IBioSequenceBatch {
import (
"os"
"github.com/schollz/progressbar/v3"
)
// Speed returns a new batch iterator that forwards, unchanged, every batch
// read from the receiver while reporting the progress on a progress bar
// written to os.Stderr. The bar is advanced by the number of sequences of
// each forwarded batch.
func (iterator IBioSequenceBatch) Speed() IBioSequenceBatch {
newIter := MakeIBioSequenceBatch()
// One producer goroutine (started below) will call Done on newIter.
newIter.Add(1)
go func() {
// Waits for the producer, then closes the output channel.
newIter.WaitAndClose()
}()
bar := progressbar.NewOptions(
// NOTE(review): -1 presumably stands for an unknown total - confirm
// against the progressbar documentation.
-1,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowCount(),
progressbar.OptionShowIts(),
progressbar.OptionSetDescription("[Sequence Processing]"))
go func() {
for iterator.Next() {
batch := iterator.Get()
// The length is read before the batch is pushed.
// NOTE(review): presumably because the batch may already be consumed
// downstream once Push returns - confirm.
l := batch.Length()
newIter.Push(batch)
bar.Add(l)
}
newIter.Done()
}()
return newIter
}

View File

@@ -7,32 +7,32 @@ import (
// Returns a sub sequence start from position 'from' included,
// to position 'to' excluded. Coordinates start at position 0.
func (sequence BioSequence) Subsequence(from, to int, circular bool) (BioSequence, error) {
func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSequence, error) {
if from >= to && !circular {
return NilBioSequence, errors.New("from greater than to")
return nil, errors.New("from greater than to")
}
if from < 0 || from >= sequence.Length() {
return NilBioSequence, errors.New("from out of bounds")
return nil, errors.New("from out of bounds")
}
if to <= 0 || to > sequence.Length() {
return NilBioSequence, errors.New("to out of bounds")
return nil, errors.New("to out of bounds")
}
var newSeq BioSequence
var newSeq *BioSequence
if from < to {
newSeq = MakeEmptyBioSequence()
newSeq = NewEmptyBioSequence()
newSeq.Write(sequence.Sequence()[from:to])
if sequence.HasQualities() {
newSeq.WriteQualities(sequence.Qualities()[from:to])
}
newSeq.sequence.id = fmt.Sprintf("%s_sub[%d..%d]", sequence.Id(), from+1, to)
newSeq.sequence.definition = sequence.sequence.definition
newSeq.id = fmt.Sprintf("%s_sub[%d..%d]", sequence.Id(), from+1, to)
newSeq.definition = sequence.definition
} else {
newSeq, _ = sequence.Subsequence(from, sequence.Length(), false)
newSeq.Write(sequence.Sequence()[0:to])
@@ -44,7 +44,7 @@ func (sequence BioSequence) Subsequence(from, to int, circular bool) (BioSequenc
}
if len(sequence.Annotations()) > 0 {
newSeq.sequence.annotations = GetAnnotation(sequence.Annotations())
newSeq.annotations = GetAnnotation(sequence.Annotations())
}
return newSeq, nil

View File

@@ -2,16 +2,15 @@ package obiseq
import (
"log"
"time"
)
type SeqAnnotator func(BioSequence)
type SeqAnnotator func(*BioSequence)
type SeqWorker func(BioSequence) BioSequence
type SeqWorker func(*BioSequence) *BioSequence
type SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
f := func(seq BioSequence) BioSequence {
f := func(seq *BioSequence) *BioSequence {
function(seq)
return seq
}
@@ -63,11 +62,7 @@ func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IB
newIter.Add(nworkers)
go func() {
newIter.Wait()
for len(newIter.Channel()) > 0 {
time.Sleep(time.Millisecond)
}
close(newIter.pointer.channel)
newIter.WaitAndClose()
log.Println("End of the batch workers")
}()
@@ -78,7 +73,7 @@ func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IB
for i, seq := range batch.slice {
batch.slice[i] = worker(seq)
}
newIter.pointer.channel <- batch
newIter.Push(batch)
}
newIter.Done()
}
@@ -109,11 +104,7 @@ func (iterator IBioSequenceBatch) MakeISliceWorker(worker SeqSliceWorker, sizes
newIter.Add(nworkers)
go func() {
newIter.Wait()
for len(newIter.Channel()) > 0 {
time.Sleep(time.Millisecond)
}
close(newIter.pointer.channel)
newIter.WaitAndClose()
log.Println("End of the batch slice workers")
}()