First commit

This commit is contained in:
2022-01-13 23:27:39 +01:00
parent dab6549cad
commit f53bf1b804
93 changed files with 11042 additions and 0 deletions

358
pkg/obiseq/batchiterator.go Normal file
View File

@@ -0,0 +1,358 @@
package obiseq
import (
"log"
"sync"
)
// BioSequenceBatch groups a slice of sequences with the order number
// attached to that slice.
type BioSequenceBatch struct {
	slice BioSequenceSlice // sequences carried by the batch
	order int              // order number of the batch
}

// NilBioSequenceBatch is the sentinel batch: a nil slice with order -1.
var NilBioSequenceBatch = BioSequenceBatch{slice: nil, order: -1}
// MakeBioSequenceBatch builds a batch tagged with order and holding the
// given sequences. The variadic slice is stored as is; it is not copied.
func MakeBioSequenceBatch(order int, sequences ...BioSequence) BioSequenceBatch {
	var b BioSequenceBatch
	b.order = order
	b.slice = sequences
	return b
}
// Order returns the order number of the batch.
func (b BioSequenceBatch) Order() int {
	return b.order
}
// Slice returns the sequences stored in the batch (the underlying slice,
// not a copy).
func (b BioSequenceBatch) Slice() BioSequenceSlice {
	return b.slice
}
// Length returns the number of sequences stored in the batch.
func (b BioSequenceBatch) Length() int {
	n := len(b.slice)
	return n
}
// IsNil reports whether the batch holds a nil slice, as
// NilBioSequenceBatch does.
func (b BioSequenceBatch) IsNil() bool {
	return b.slice == nil
}
// __ibiosequencebatch__ is the private state of an iterator over
// BioSequenceBatch values, based on a channel.
type __ibiosequencebatch__ struct {
	channel     chan BioSequenceBatch // channel the batches are read from
	current     BioSequenceBatch      // batch returned by Get
	all_done    *sync.WaitGroup       // wait group exposed through Add/Done/Wait
	buffer_size int                   // capacity the channel was created with
	finished    bool                  // storage for the end-of-data flag
	p_finished  *bool                 // flag actually tested; shared by Split copies
}
// IBioSequenceBatch is a handle on a batch iterator. Copies of the handle
// share the same underlying state.
type IBioSequenceBatch struct {
	pointer *__ibiosequencebatch__
}

// NilIBioSequenceBatch is the handle that points to no iterator state.
var NilIBioSequenceBatch = IBioSequenceBatch{}
// MakeIBioSequenceBatch creates a new batch iterator. The first value of
// sizes, when present, is the capacity of the underlying channel
// (default 1); further values are ignored.
func MakeIBioSequenceBatch(sizes ...int) IBioSequenceBatch {
	capacity := 1
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	state := &__ibiosequencebatch__{
		channel:     make(chan BioSequenceBatch, capacity),
		current:     NilBioSequenceBatch,
		all_done:    new(sync.WaitGroup),
		buffer_size: capacity,
	}
	// The end-of-data flag initially refers to the iterator's own field.
	state.p_finished = &state.finished

	return IBioSequenceBatch{pointer: state}
}
// Add adds n to the counter of the iterator's wait group.
func (it IBioSequenceBatch) Add(n int) {
	wg := it.pointer.all_done
	wg.Add(n)
}
// Done decrements the counter of the iterator's wait group by one.
func (it IBioSequenceBatch) Done() {
	wg := it.pointer.all_done
	wg.Done()
}
// Wait blocks until the counter of the iterator's wait group is zero.
func (it IBioSequenceBatch) Wait() {
	wg := it.pointer.all_done
	wg.Wait()
}
// Channel returns the channel the iterator reads its batches from.
func (it IBioSequenceBatch) Channel() chan BioSequenceBatch {
	return it.pointer.channel
}
// IsNil reports whether the handle points to no iterator state.
func (it IBioSequenceBatch) IsNil() bool {
	return it.pointer == nil
}
// BufferSize returns the channel capacity recorded when the iterator
// was created.
func (it IBioSequenceBatch) BufferSize() int {
	return it.pointer.buffer_size
}
// Split returns a new iterator reading from the same channel as the
// receiver. The new iterator shares the wait group and the end-of-data
// flag of the receiver but keeps its own current batch.
func (iterator IBioSequenceBatch) Split() IBioSequenceBatch {
	src := iterator.pointer
	twin := &__ibiosequencebatch__{
		channel:     src.channel,
		current:     NilBioSequenceBatch,
		all_done:    src.all_done,
		buffer_size: src.buffer_size,
		finished:    false,
		p_finished:  src.p_finished,
	}
	return IBioSequenceBatch{pointer: twin}
}
// Next moves the iterator to the next batch read from the channel and
// returns true when one is available. Once the channel is closed and
// drained, the current batch is set to NilBioSequenceBatch, the
// end-of-data flag is raised and Next returns false.
func (iterator IBioSequenceBatch) Next() bool {
	state := iterator.pointer
	if *state.p_finished {
		return false
	}

	batch, open := <-state.channel
	if !open {
		state.current = NilBioSequenceBatch
		*state.p_finished = true
		return false
	}

	state.current = batch
	return true
}
// Get returns the BioSequenceBatch the iterator currently points to.
// Call Next to move to the following entry before calling Get again
// to retrieve it.
func (iterator IBioSequenceBatch) Get() BioSequenceBatch {
	state := iterator.pointer
	return state.current
}
// Finished reports whether the end-of-data flag has been raised, i.e.
// whether a call to Next found the channel closed and drained.
func (iterator IBioSequenceBatch) Finished() bool {
	done := iterator.pointer.p_finished
	return *done
}
// IBioSequence flattens the batch iterator into an iterator over single
// sequences. The buffer size of the new iterator defaults to that of the
// receiver and can be overridden with the first value of sizes.
func (iterator IBioSequenceBatch) IBioSequence(sizes ...int) IBioSequence {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	flat := MakeIBioSequence(capacity)
	flat.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		flat.Wait()
		close(flat.pointer.channel)
	}()

	// Producer: forwards the sequences batch after batch, in reading order.
	go func() {
		out := flat.pointer.channel
		for iterator.Next() {
			for _, seq := range iterator.Get().slice {
				out <- seq
			}
		}
		flat.Done()
	}()

	return flat
}
// SortBatches returns an iterator yielding the batches of the receiver in
// increasing order number, starting at 0. Batches arriving ahead of their
// turn are kept in memory until every preceding order has been sent.
// The buffer size of the new iterator defaults to that of the receiver
// and can be overridden with the first value of sizes.
//
// NOTE(review): batches still waiting when the source ends (a gap in the
// order numbers) are never sent.
func (iterator IBioSequenceBatch) SortBatches(sizes ...int) IBioSequenceBatch {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	sorted := MakeIBioSequenceBatch(capacity)
	sorted.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		sorted.Wait()
		close(sorted.pointer.channel)
	}()

	go func() {
		expected := 0
		pending := make(map[int]BioSequenceBatch)
		out := sorted.pointer.channel

		for iterator.Next() {
			batch := iterator.Get()
			if batch.order != expected {
				// Too early: park it until its turn comes.
				pending[batch.order] = batch
				continue
			}

			out <- batch
			expected++

			// Flush every parked batch that is now in sequence.
			for {
				queued, found := pending[expected]
				if !found {
					break
				}
				out <- queued
				delete(pending, expected)
				expected++
			}
		}
		sorted.Done()
	}()

	return sorted
}
// Concat returns an iterator yielding every batch of the receiver followed
// by every batch of each iterator of iterators, the sources being consumed
// one after the other. Order numbers are shifted so that the orders of a
// source start right after the largest order emitted for the previous
// sources. When no extra iterator is given, the receiver itself is
// returned.
func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSequenceBatch {
	if len(iterators) == 0 {
		return iterator
	}

	new_iter := MakeIBioSequenceBatch(iterator.BufferSize())
	new_iter.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		new_iter.Wait()
		close(new_iter.Channel())
	}()

	go func() {
		// max_order starts at -1 (nothing emitted yet) so that an empty
		// leading source does not shift the following ones: the first
		// emitted order stays 0 and the output numbering has no gap.
		max_order := -1
		offset := 0

		sources := append([]IBioSequenceBatch{iterator}, iterators...)
		for _, src := range sources {
			for src.Next() {
				b := src.Get()
				shifted := b.order + offset
				if shifted > max_order {
					max_order = shifted
				}
				new_iter.Channel() <- MakeBioSequenceBatch(shifted, b.slice...)
			}
			offset = max_order + 1
		}
		new_iter.Done()
	}()

	return new_iter
}
// Rebatch redistributes the sequences of the receiver into a new
// IBioSequenceBatch whose batches all hold size sequences, except
// possibly the last one. The source is first passed through SortBatches,
// and the produced batches are numbered from 0. The buffer size of the
// new iterator defaults to that of the receiver and can be overridden
// with the first value of sizes.
func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBatch {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	rebatched := MakeIBioSequenceBatch(capacity)
	rebatched.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		rebatched.Wait()
		close(rebatched.pointer.channel)
	}()

	go func() {
		rank := 0
		sorted := iterator.SortBatches()
		pending := make(BioSequenceSlice, 0, size)

		for sorted.Next() {
			for _, seq := range sorted.Get().slice {
				pending = append(pending, seq)
				if len(pending) != size {
					continue
				}
				rebatched.Channel() <- MakeBioSequenceBatch(rank, pending...)
				rank++
				// The sent slice now belongs to the batch: start a new one.
				pending = make(BioSequenceSlice, 0, size)
			}
		}

		// Send the incomplete last batch, if any.
		if len(pending) != 0 {
			rebatched.Channel() <- MakeBioSequenceBatch(rank, pending...)
		}
		rebatched.Done()
	}()

	return rebatched
}
// Destroy consumes the whole iterator and calls Destroy on every
// sequence of every batch. Start and end of the operation are logged.
func (iterator IBioSequenceBatch) Destroy() {
	log.Println("Start recycling of Bioseq objects")

	for iterator.Next() {
		sequences := iterator.Get().Slice()
		for _, seq := range sequences {
			// Destroy has a pointer receiver; seq is the loop copy.
			seq.Destroy()
		}
	}

	log.Println("End of the recycling of Bioseq objects")
}
// PairWith associates the batches of the receiver (forward reads) with
// those of reverse. Both iterators are first rebatched to the same batch
// size, then read in lockstep; each couple of batches is combined with
// MakePairedBioSequenceBatch.
//
// The first value of sizes is the batch size (default 5000), the second
// one the buffer size of the new iterator (default: that of the
// receiver). The producer goroutine panics when reverse runs out of
// batches before the receiver does.
func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...int) IPairedBioSequenceBatch {
	batchsize, buffsize := 5000, iterator.BufferSize()
	if len(sizes) >= 1 {
		batchsize = sizes[0]
	}
	if len(sizes) >= 2 {
		buffsize = sizes[1]
	}

	forward := iterator.Rebatch(batchsize)
	backward := reverse.Rebatch(batchsize)

	paired := MakeIPairedBioSequenceBatch(buffsize)
	paired.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		paired.Wait()
		close(paired.pointer.channel)
		log.Println("End of association of paired reads")
	}()

	log.Println("Start association of paired reads")

	go func() {
		for forward.Next() {
			if !backward.Next() {
				log.Panicln("Etrange reverse pas prêt")
			}
			paired.Channel() <- MakePairedBioSequenceBatch(forward.Get(),
				backward.Get())
		}
		paired.Done()
	}()

	return paired
}

185
pkg/obiseq/biosequence.go Normal file
View File

@@ -0,0 +1,185 @@
package obiseq
import (
"bytes"
"crypto/md5"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
)
// Quality is a vector of quality scores, one byte per position.
type Quality []uint8

// __default_qualities__ is a shared, lazily grown run of the default
// quality score (40). It is only ever extended, never modified.
var __default_qualities__ = make(Quality, 0, 500)

// __make_default_qualities__ returns a Quality of the requested length
// filled with the default score 40. The result is a view on the shared
// package-level buffer, which is extended when it is too short; a length
// of zero or less yields an empty Quality.
//
// The returned slice has its capacity clipped to its length, so that an
// append by the caller reallocates instead of overwriting the shared
// buffer. The function mutates a package-level variable without any
// synchronization.
func __make_default_qualities__(length int) Quality {
	if length <= 0 {
		return __default_qualities__[0:0:0]
	}

	// Grow the shared buffer up to exactly `length` elements.
	for len(__default_qualities__) < length {
		__default_qualities__ = append(__default_qualities__, 40)
	}

	return __default_qualities__[0:length:length]
}
// Annotation maps annotation keys to values of any type.
type Annotation map[string]interface{}

// __sequence__ is the storage behind a BioSequence. Every textual field
// is kept in a bytes.Buffer.
type __sequence__ struct {
	id          bytes.Buffer // identifier
	definition  bytes.Buffer // definition text
	sequence    bytes.Buffer // sequence data
	qualities   bytes.Buffer // quality scores; empty when none were set
	feature     bytes.Buffer // features text
	annotations Annotation   // key/value annotations
}

// BioSequence is a handle on a __sequence__. Copies of a BioSequence
// share the same underlying storage.
type BioSequence struct {
	sequence *__sequence__
}

// BioSequenceSlice is a slice of BioSequence handles.
type BioSequenceSlice []BioSequence

// NilBioSequence is the handle that points to no storage.
var NilBioSequence = BioSequence{}
// IsNil reports whether the handle points to no storage.
func (seq BioSequence) IsNil() bool {
	return seq.sequence == nil
}
// Reset empties every buffer of the sequence and removes all its
// annotations, keeping the annotation map itself.
func (s BioSequence) Reset() {
	rec := s.sequence

	rec.id.Reset()
	rec.definition.Reset()
	rec.sequence.Reset()
	rec.qualities.Reset()
	rec.feature.Reset()

	for key := range rec.annotations {
		delete(rec.annotations, key)
	}
}
// Copy returns a new sequence obtained from MakeEmptyBioSequence and
// filled with a copy of every buffer of the receiver. Annotations are
// copied with goutils.CopyMap when the receiver has at least one.
func (s BioSequence) Copy() BioSequence {
	clone := MakeEmptyBioSequence()
	src, dst := s.sequence, clone.sequence

	dst.id.Write(src.id.Bytes())
	dst.definition.Write(src.definition.Bytes())
	dst.sequence.Write(src.sequence.Bytes())
	dst.qualities.Write(src.qualities.Bytes())
	dst.feature.Write(src.feature.Bytes())

	if len(src.annotations) != 0 {
		goutils.CopyMap(dst.annotations, src.annotations)
	}

	return clone
}
// Id returns the identifier of the sequence as a string.
func (s BioSequence) Id() string {
	buf := &s.sequence.id
	return buf.String()
}
// Definition returns the definition of the sequence as a string.
func (s BioSequence) Definition() string {
	buf := &s.sequence.definition
	return buf.String()
}
// Sequence returns the sequence data as the byte slice exposed by the
// internal buffer (no copy is made).
func (s BioSequence) Sequence() []byte {
	buf := &s.sequence.sequence
	return buf.Bytes()
}
// String returns the sequence data as a string.
func (s BioSequence) String() string {
	buf := &s.sequence.sequence
	return buf.String()
}
// Length returns the number of bytes of the sequence data.
func (s BioSequence) Length() int {
	buf := &s.sequence.sequence
	return buf.Len()
}
// HasQualities reports whether the quality buffer of the sequence is
// not empty.
func (s BioSequence) HasQualities() bool {
	stored := s.sequence.qualities.Len()
	return stored > 0
}
// Qualities returns the quality scores of the sequence. When none are
// stored, it returns default qualities, obtained from
// __make_default_qualities__, of the same length as the sequence data.
func (s BioSequence) Qualities() Quality {
	if !s.HasQualities() {
		return __make_default_qualities__(s.sequence.sequence.Len())
	}
	return s.sequence.qualities.Bytes()
}
// Features returns the content of the feature buffer as a string.
func (s BioSequence) Features() string {
	buf := &s.sequence.feature
	return buf.String()
}
// Annotations returns the annotation map of the sequence (the map
// itself, not a copy).
func (s BioSequence) Annotations() Annotation {
	rec := s.sequence
	return rec.annotations
}
// MD5 returns the MD5 checksum of the sequence data.
func (s BioSequence) MD5() [16]byte {
	data := s.sequence.sequence.Bytes()
	return md5.Sum(data)
}
// Count returns the value of the "count" annotation converted to an int
// with goutils.InterfaceToInt. It returns 1 when the annotation is
// missing or cannot be converted.
func (s BioSequence) Count() int {
	// Indexing a nil map is safe and reports the key as absent.
	raw, found := s.sequence.annotations["count"]
	if !found {
		return 1
	}

	count, err := goutils.InterfaceToInt(raw)
	if err != nil {
		return 1
	}
	return count
}
// Taxid returns the value of the "taxid" annotation converted to an int
// with goutils.InterfaceToInt. It returns 1 when the annotation is
// missing or cannot be converted.
func (s BioSequence) Taxid() int {
	// Indexing a nil map is safe and reports the key as absent.
	raw, found := s.sequence.annotations["taxid"]
	if !found {
		return 1
	}

	taxid, err := goutils.InterfaceToInt(raw)
	if err != nil {
		return 1
	}
	return taxid
}
// SetId replaces the identifier of the sequence with id.
func (s BioSequence) SetId(id string) {
	buf := &s.sequence.id
	buf.Reset()
	buf.WriteString(id)
}
// SetDefinition replaces the definition of the sequence with definition.
func (s BioSequence) SetDefinition(definition string) {
	buf := &s.sequence.definition
	buf.Reset()
	buf.WriteString(definition)
}
// SetFeatures replaces the content of the feature buffer with feature.
func (s BioSequence) SetFeatures(feature string) {
	buf := &s.sequence.feature
	buf.Reset()
	buf.WriteString(feature)
}
// SetSequence replaces the sequence data with a copy of sequence.
func (s BioSequence) SetSequence(sequence []byte) {
	buf := &s.sequence.sequence
	buf.Reset()
	buf.Write(sequence)
}
// SetQualities replaces the stored quality scores with a copy of
// qualities.
func (s BioSequence) SetQualities(qualities Quality) {
	buf := &s.sequence.qualities
	buf.Reset()
	buf.Write(qualities)
}
// Write appends data to the sequence data and returns the result of the
// underlying buffer write.
func (s BioSequence) Write(data []byte) (int, error) {
	buf := &s.sequence.sequence
	return buf.Write(data)
}
// WriteString appends data to the sequence data and returns the result
// of the underlying buffer write.
func (s BioSequence) WriteString(data string) (int, error) {
	buf := &s.sequence.sequence
	return buf.WriteString(data)
}
// WriteByte appends the byte data to the sequence data and returns the
// result of the underlying buffer write.
func (s BioSequence) WriteByte(data byte) error {
	buf := &s.sequence.sequence
	return buf.WriteByte(data)
}
// WriteRune appends the encoding of the rune data to the sequence data
// and returns the result of the underlying buffer write.
func (s BioSequence) WriteRune(data rune) (int, error) {
	buf := &s.sequence.sequence
	return buf.WriteRune(data)
}

326
pkg/obiseq/iterator.go Normal file
View File

@@ -0,0 +1,326 @@
package obiseq
import (
"sync"
"time"
)
// __ibiosequence__ is the private state of an iterator over BioSequence
// values, based on a channel.
type __ibiosequence__ struct {
	channel     chan BioSequence // channel the sequences are read from
	current     BioSequence      // sequence returned by Get
	all_done    *sync.WaitGroup  // wait group exposed through Add/Done/Wait
	buffer_size int              // capacity the channel was created with
	finished    bool             // storage for the end-of-data flag
	p_finished  *bool            // flag actually tested; shared by Split copies
}
// IBioSequence is a handle on a sequence iterator. Copies of the handle
// share the same underlying state.
type IBioSequence struct {
	pointer *__ibiosequence__
}

// NilIBioSequence is the handle that points to no iterator state.
var NilIBioSequence = IBioSequence{}
// IsNil reports whether the handle points to no iterator state.
func (it IBioSequence) IsNil() bool {
	return it.pointer == nil
}
// Add adds n to the counter of the iterator's wait group.
func (it IBioSequence) Add(n int) {
	wg := it.pointer.all_done
	wg.Add(n)
}
// Done decrements the counter of the iterator's wait group by one.
func (it IBioSequence) Done() {
	wg := it.pointer.all_done
	wg.Done()
}
// Wait blocks until the counter of the iterator's wait group is zero.
func (it IBioSequence) Wait() {
	wg := it.pointer.all_done
	wg.Wait()
}
// Channel returns the channel the iterator reads its sequences from.
func (it IBioSequence) Channel() chan BioSequence {
	return it.pointer.channel
}
// PChannel returns the address of the channel field of the iterator
// state.
func (it IBioSequence) PChannel() *chan BioSequence {
	state := it.pointer
	return &state.channel
}
// MakeIBioSequence creates a new sequence iterator. The first value of
// sizes, when present, is the capacity of the underlying channel
// (default 1); further values are ignored.
func MakeIBioSequence(sizes ...int) IBioSequence {
	capacity := 1
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	state := &__ibiosequence__{
		channel:     make(chan BioSequence, capacity),
		current:     NilBioSequence,
		all_done:    new(sync.WaitGroup),
		buffer_size: capacity,
	}
	// The end-of-data flag initially refers to the iterator's own field.
	state.p_finished = &state.finished

	return IBioSequence{pointer: state}
}
// Split returns a new iterator reading from the same channel as the
// receiver. The new iterator shares the wait group and the end-of-data
// flag of the receiver but keeps its own current sequence.
func (iterator IBioSequence) Split() IBioSequence {
	src := iterator.pointer
	twin := &__ibiosequence__{
		channel:     src.channel,
		current:     NilBioSequence,
		all_done:    src.all_done,
		buffer_size: src.buffer_size,
		finished:    false,
		p_finished:  src.p_finished,
	}
	return IBioSequence{pointer: twin}
}
// Next moves the iterator to the next sequence read from the channel and
// returns true when one is available. It returns false for a nil
// iterator. Once the channel is closed and drained, the current sequence
// is set to NilBioSequence, the end-of-data flag is raised and Next
// returns false.
func (iterator IBioSequence) Next() bool {
	// A nil iterator has no state to update: it must be handled before
	// any access through iterator.pointer.
	if iterator.IsNil() {
		return false
	}

	state := iterator.pointer
	if *state.p_finished {
		state.current = NilBioSequence
		return false
	}

	next, ok := <-state.channel
	if !ok {
		state.current = NilBioSequence
		*state.p_finished = true
		return false
	}

	state.current = next
	return true
}
// Get returns the BioSequence the iterator currently points to.
// Call Next to move to the following entry before calling Get again
// to retrieve it.
func (iterator IBioSequence) Get() BioSequence {
	state := iterator.pointer
	return state.current
}
// Finished reports whether the end-of-data flag has been raised, i.e.
// whether a call to Next found the channel closed and drained.
func (iterator IBioSequence) Finished() bool {
	done := iterator.pointer.p_finished
	return *done
}
// BufferSize returns the channel capacity recorded when the iterator
// was created.
func (it IBioSequence) BufferSize() int {
	return it.pointer.buffer_size
}
// IBioSequenceBatch converts an IBioSequence iterator into an iterator
// over batches of sequences. By default a batch holds 100 sequences and
// the new iterator has the same buffer size as the source iterator.
// These defaults can be overridden with one or two optional parameters:
// the first one is the batch size, the second one the buffer size.
//
// Batches are numbered from 0 in reading order. Every batch holds exactly
// the batch size, except possibly the last one; no empty batch is
// emitted. A batch size lower than 1 is treated as 1.
func (iterator IBioSequence) IBioSequenceBatch(sizes ...int) IBioSequenceBatch {
	batchsize := 100
	buffsize := iterator.BufferSize()

	if len(sizes) > 0 {
		batchsize = sizes[0]
	}
	if len(sizes) > 1 {
		buffsize = sizes[1]
	}
	// A batch must hold at least one sequence, otherwise no batch could
	// ever be completed.
	if batchsize < 1 {
		batchsize = 1
	}

	new_iter := MakeIBioSequenceBatch(buffsize)
	new_iter.Add(1)

	go func() {
		new_iter.Wait()
		// NOTE(review): closing a buffered channel does not discard the
		// values still queued in it, so this polling looks unnecessary;
		// it is kept to preserve the existing timing of the close.
		for len(new_iter.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(new_iter.pointer.channel)
	}()

	go func() {
		order := 0
		pending := make(BioSequenceSlice, 0, batchsize)

		for iterator.Next() {
			pending = append(pending, iterator.Get())
			if len(pending) < batchsize {
				continue
			}
			new_iter.pointer.channel <- BioSequenceBatch{slice: pending, order: order}
			order++
			// The sent slice now belongs to the batch: start a new one.
			pending = make(BioSequenceSlice, 0, batchsize)
		}

		// Send the incomplete last batch, but never an empty one.
		if len(pending) > 0 {
			new_iter.pointer.channel <- BioSequenceBatch{slice: pending, order: order}
		}
		new_iter.Done()
	}()

	return new_iter
}
// IBioSequence returns a new iterator yielding the sequences of the
// receiver in the same order. The buffer size of the new iterator
// defaults to that of the receiver and can be overridden with the first
// value of sizes.
func (iterator IBioSequence) IBioSequence(sizes ...int) IBioSequence {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	relay := MakeIBioSequence(capacity)
	relay.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		relay.Wait()
		close(relay.pointer.channel)
	}()

	go func() {
		out := relay.pointer.channel
		for iterator.Next() {
			out <- iterator.Get()
		}
		relay.Done()
	}()

	return relay
}
// Skip returns a new iterator yielding the sequences of the receiver
// except the n first ones. The buffer size of the new iterator defaults
// to that of the receiver and can be overridden with the first value of
// sizes.
func (iterator IBioSequence) Skip(n int, sizes ...int) IBioSequence {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	rest := MakeIBioSequence(capacity)
	rest.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		rest.Wait()
		close(rest.pointer.channel)
	}()

	go func() {
		out := rest.pointer.channel
		skipped := 0
		for iterator.Next() {
			if skipped < n {
				// Still within the n leading sequences: drop it.
				skipped++
				continue
			}
			out <- iterator.Get()
		}
		rest.Done()
	}()

	return rest
}
// Head returns a new iterator yielding at most the n first sequences of
// the receiver. The buffer size of the new iterator defaults to that of
// the receiver and can be overridden with the first value of sizes.
//
// The output is released (Done is called, which lets the output channel
// be closed) as soon as n sequences have been sent, or when the source
// ends, whichever comes first. The source is nevertheless read until its
// end, the extra sequences being dropped.
func (iterator IBioSequence) Head(n int, sizes ...int) IBioSequence {
	buffsize := iterator.BufferSize()
	if len(sizes) > 0 {
		buffsize = sizes[0]
	}

	new_iter := MakeIBioSequence(buffsize)
	new_iter.Add(1)

	go func() {
		new_iter.Wait()
		close(new_iter.pointer.channel)
	}()

	go func() {
		released := false
		// release calls Done exactly once, whatever path leads to it.
		release := func() {
			if !released {
				new_iter.Done()
				released = true
			}
		}

		if n <= 0 {
			// Nothing to send: the output can be closed right away.
			release()
		}

		sent := 0
		for iterator.Next() {
			if sent >= n {
				// Past the head: keep reading the source, drop the data.
				continue
			}
			new_iter.pointer.channel <- iterator.Get()
			sent++
			if sent == n {
				release()
			}
		}

		// The source held fewer than n sequences: release now so that
		// the output channel gets closed.
		release()
	}()

	return new_iter
}
// Tail discards every sequence of the source iterator except the n last
// ones, which are yielded in their original order. The buffer size of
// the new iterator defaults to that of the receiver and can be
// overridden with the first value of sizes.
//
// When n is zero or negative the source is read until its end and
// nothing is yielded.
func (iterator IBioSequence) Tail(n int, sizes ...int) IBioSequence {
	buffsize := iterator.BufferSize()
	if len(sizes) > 0 {
		buffsize = sizes[0]
	}

	new_iter := MakeIBioSequence(buffsize)
	new_iter.Add(1)

	go func() {
		new_iter.Wait()
		close(new_iter.pointer.channel)
	}()

	go func() {
		if n <= 0 {
			// No ring buffer can be built (and i%n would divide by
			// zero): just consume the source.
			for iterator.Next() {
			}
			new_iter.Done()
			return
		}

		// Ring buffer holding the n most recently read sequences.
		ring := make(BioSequenceSlice, n)
		count := 0
		for ; iterator.Next(); count++ {
			ring[count%n] = iterator.Get()
		}

		if count > n {
			// The ring wrapped: the oldest kept sequence is at count%n.
			for j := 0; j < n; j++ {
				new_iter.Channel() <- ring[(count+j)%n]
			}
		} else {
			// Fewer than (or exactly) n sequences: they sit in order
			// from index 0.
			for j := 0; j < count; j++ {
				new_iter.Channel() <- ring[j]
			}
		}
		new_iter.Done()
	}()

	return new_iter
}
// Concat returns an iterator yielding every sequence of the receiver
// followed by every sequence of each iterator of iterators, the sources
// being consumed one after the other. When no extra iterator is given,
// the receiver itself is returned. The new iterator has the buffer size
// of the receiver.
func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
	if len(iterators) == 0 {
		return iterator
	}

	chained := MakeIBioSequence(iterator.BufferSize())
	chained.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		chained.Wait()
		close(chained.pointer.channel)
	}()

	go func() {
		out := chained.pointer.channel

		for iterator.Next() {
			out <- iterator.Get()
		}
		for _, other := range iterators {
			for other.Next() {
				out <- other.Get()
			}
		}
		chained.Done()
	}()

	return chained
}

19
pkg/obiseq/join.go Normal file
View File

@@ -0,0 +1,19 @@
package obiseq
import "git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
// Join returns a new sequence whose data is the data of the receiver
// followed by the data of seq2. Identifier and definition are those of
// the receiver. When copy_annot is true, the annotations of the receiver
// are copied to the new sequence with goutils.CopyMap. The returned
// error is always nil.
func (sequence BioSequence) Join(seq2 BioSequence, copy_annot bool) (BioSequence, error) {
	joined := MakeEmptyBioSequence()

	joined.SetId(sequence.Id())
	joined.SetDefinition(sequence.Definition())

	for _, part := range [][]byte{sequence.Sequence(), seq2.Sequence()} {
		joined.Write(part)
	}

	if !copy_annot {
		return joined, nil
	}

	goutils.CopyMap(joined.Annotations(), sequence.Annotations())
	return joined, nil
}

View File

@@ -0,0 +1,196 @@
package obiseq
import (
"log"
"sync"
)
// PairedBioSequenceBatch groups a slice of forward sequences and a slice
// of reverse sequences under a common order number.
type PairedBioSequenceBatch struct {
	forward BioSequenceSlice // forward sequences
	reverse BioSequenceSlice // reverse sequences
	order   int              // order number of the batch
}

// NilPairedBioSequenceBatch is the sentinel paired batch: nil slices
// with order -1.
var NilPairedBioSequenceBatch = PairedBioSequenceBatch{forward: nil, reverse: nil, order: -1}
// MakePairedBioSequenceBatch builds a paired batch from a forward and a
// reverse batch. Both batches must carry the same order number, otherwise
// the program is terminated through log.Fatalf. Every sequence of the
// reverse batch is reverse-complemented in place.
func MakePairedBioSequenceBatch(forward, reverse BioSequenceBatch) PairedBioSequenceBatch {
	if forward.order != reverse.order {
		log.Fatalf("Forward order : %d and reverse order : %d are not matching",
			forward.order, reverse.order)
	}

	// The handles share their storage, so working on the loop copy
	// modifies the sequences of the slice.
	for _, seq := range reverse.slice {
		seq.ReverseComplement(true)
	}

	var paired PairedBioSequenceBatch
	paired.forward = forward.slice
	paired.reverse = reverse.slice
	paired.order = forward.order
	return paired
}
// Order returns the order number of the paired batch.
func (b PairedBioSequenceBatch) Order() int {
	return b.order
}
// Length returns the number of sequences of the forward slice.
func (b PairedBioSequenceBatch) Length() int {
	n := len(b.forward)
	return n
}
// Forward returns the forward sequences of the batch (the underlying
// slice, not a copy).
func (b PairedBioSequenceBatch) Forward() BioSequenceSlice {
	return b.forward
}
// Reverse returns the reverse sequences of the batch (the underlying
// slice, not a copy).
func (b PairedBioSequenceBatch) Reverse() BioSequenceSlice {
	return b.reverse
}
// IsNil reports whether the forward slice of the batch is nil, as it is
// for NilPairedBioSequenceBatch.
func (b PairedBioSequenceBatch) IsNil() bool {
	return b.forward == nil
}
// __ipairedbiosequencebatch__ is the private state of an iterator over
// PairedBioSequenceBatch values, based on a channel.
type __ipairedbiosequencebatch__ struct {
	channel     chan PairedBioSequenceBatch // channel the batches are read from
	current     PairedBioSequenceBatch      // batch returned by Get
	all_done    *sync.WaitGroup             // wait group exposed through Add/Done/Wait
	buffer_size int                         // capacity the channel was created with
	finished    bool                        // storage for the end-of-data flag
	p_finished  *bool                       // flag actually tested; shared by Split copies
}
// IPairedBioSequenceBatch is a handle on a paired batch iterator. Copies
// of the handle share the same underlying state.
type IPairedBioSequenceBatch struct {
	pointer *__ipairedbiosequencebatch__
}

// NilIPairedBioSequenceBatch is the handle that points to no iterator
// state.
var NilIPairedBioSequenceBatch = IPairedBioSequenceBatch{}
// MakeIPairedBioSequenceBatch creates a new paired batch iterator. The
// first value of sizes, when present, is the capacity of the underlying
// channel (default 1); further values are ignored.
func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch {
	capacity := 1
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	state := &__ipairedbiosequencebatch__{
		channel:     make(chan PairedBioSequenceBatch, capacity),
		current:     NilPairedBioSequenceBatch,
		all_done:    new(sync.WaitGroup),
		buffer_size: capacity,
	}
	// The end-of-data flag initially refers to the iterator's own field.
	state.p_finished = &state.finished

	return IPairedBioSequenceBatch{pointer: state}
}
// Add adds n to the counter of the iterator's wait group.
func (it IPairedBioSequenceBatch) Add(n int) {
	wg := it.pointer.all_done
	wg.Add(n)
}
// Done decrements the counter of the iterator's wait group by one.
func (it IPairedBioSequenceBatch) Done() {
	wg := it.pointer.all_done
	wg.Done()
}
// Wait blocks until the counter of the iterator's wait group is zero.
func (it IPairedBioSequenceBatch) Wait() {
	wg := it.pointer.all_done
	wg.Wait()
}
// Channel returns the channel the iterator reads its batches from.
func (it IPairedBioSequenceBatch) Channel() chan PairedBioSequenceBatch {
	return it.pointer.channel
}
// IsNil reports whether the handle points to no iterator state.
func (it IPairedBioSequenceBatch) IsNil() bool {
	return it.pointer == nil
}
// BufferSize returns the channel capacity recorded when the iterator
// was created.
func (it IPairedBioSequenceBatch) BufferSize() int {
	return it.pointer.buffer_size
}
// Split returns a new iterator reading from the same channel as the
// receiver. The new iterator shares the wait group and the end-of-data
// flag of the receiver but keeps its own current batch.
func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch {
	src := iterator.pointer
	twin := &__ipairedbiosequencebatch__{
		channel:     src.channel,
		current:     NilPairedBioSequenceBatch,
		all_done:    src.all_done,
		buffer_size: src.buffer_size,
		finished:    false,
		p_finished:  src.p_finished,
	}
	return IPairedBioSequenceBatch{pointer: twin}
}
// Next moves the iterator to the next paired batch read from the channel
// and returns true when one is available. Once the channel is closed and
// drained, the current batch is set to NilPairedBioSequenceBatch, the
// end-of-data flag is raised and Next returns false.
func (iterator IPairedBioSequenceBatch) Next() bool {
	state := iterator.pointer
	if *state.p_finished {
		return false
	}

	batch, open := <-state.channel
	if !open {
		state.current = NilPairedBioSequenceBatch
		*state.p_finished = true
		return false
	}

	state.current = batch
	return true
}
// Get returns the PairedBioSequenceBatch the iterator currently points
// to. Call Next to move to the following entry before calling Get again
// to retrieve it.
func (iterator IPairedBioSequenceBatch) Get() PairedBioSequenceBatch {
	state := iterator.pointer
	return state.current
}
// Finished reports whether the end-of-data flag has been raised, i.e.
// whether a call to Next found the channel closed and drained.
func (iterator IPairedBioSequenceBatch) Finished() bool {
	done := iterator.pointer.p_finished
	return *done
}
// SortBatches returns an iterator yielding the paired batches of the
// receiver in increasing order number, starting at 0. Batches arriving
// ahead of their turn are kept in memory until every preceding order has
// been sent. The buffer size of the new iterator defaults to that of the
// receiver and can be overridden with the first value of sizes.
//
// NOTE(review): batches still waiting when the source ends (a gap in the
// order numbers) are never sent.
func (iterator IPairedBioSequenceBatch) SortBatches(sizes ...int) IPairedBioSequenceBatch {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	sorted := MakeIPairedBioSequenceBatch(capacity)
	sorted.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		sorted.Wait()
		close(sorted.pointer.channel)
	}()

	go func() {
		expected := 0
		pending := make(map[int]PairedBioSequenceBatch)
		out := sorted.pointer.channel

		for iterator.Next() {
			batch := iterator.Get()
			if batch.order != expected {
				// Too early: park it until its turn comes.
				pending[batch.order] = batch
				continue
			}

			out <- batch
			expected++

			// Flush every parked batch that is now in sequence.
			for {
				queued, found := pending[expected]
				if !found {
					break
				}
				out <- queued
				delete(pending, expected)
				expected++
			}
		}
		sorted.Done()
	}()

	return sorted
}

34
pkg/obiseq/pool.go Normal file
View File

@@ -0,0 +1,34 @@
package obiseq
import (
"sync"
)
// __bioseq__pool__ recycles __sequence__ values. A newly created value
// comes with an annotation map pre-sized for 50 entries.
var __bioseq__pool__ = sync.Pool{
	New: func() interface{} {
		return &__sequence__{annotations: make(Annotation, 50)}
	},
}
// MakeEmptyBioSequence returns a sequence whose storage is taken from
// the pool and reset, so that it holds neither data nor annotations.
func MakeEmptyBioSequence() BioSequence {
	storage := __bioseq__pool__.Get().(*__sequence__)
	seq := BioSequence{sequence: storage}
	// Pooled storage may still hold the data of a previous use.
	seq.Reset()
	return seq
}
// MakeBioSequence returns a new sequence, built with
// MakeEmptyBioSequence, with the given identifier, sequence data and
// definition.
func MakeBioSequence(id string,
	sequence []byte,
	definition string) BioSequence {
	seq := MakeEmptyBioSequence()

	seq.SetId(id)
	seq.SetSequence(sequence)
	seq.SetDefinition(definition)

	return seq
}
// Destroy gives the storage of the sequence back to the pool and
// detaches it from the handle, which becomes a nil sequence. Calling
// Destroy on a nil handle or on an already destroyed sequence does
// nothing.
//
// Other copies of the handle still refer to the recycled storage and
// must not be used afterwards.
func (sequence *BioSequence) Destroy() {
	// Never put a nil pointer into the pool: MakeEmptyBioSequence would
	// get it back and fail when resetting it.
	if sequence == nil || sequence.sequence == nil {
		return
	}

	__bioseq__pool__.Put(sequence.sequence)
	sequence.sequence = nil
}

26
pkg/obiseq/revcomp.go Normal file
View File

@@ -0,0 +1,26 @@
package obiseq
// ".ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
// __revcmp_dna__ is the complement lookup table, indexed by the low five
// bits of a sequence byte.
//
// NOTE(review): the table holds 31 entries (indices 0 to 30) while the
// mask &31 can produce 31; a byte whose low five bits are all set (for
// instance '_' or '?') would index past the end. Confirm such bytes
// never occur in sequence data.
var __revcmp_dna__ = []byte(".TVGHEFCDIJMLKNOPQYSAABWXRZ#!][")

// ReverseComplement reverse complements a DNA sequence.
// If the inplace parameter is true, the operation is done in place on
// the receiver; otherwise it is done on a copy of it. In both cases
// "_revcomp" is appended to the identifier of the sequence worked on,
// which is the sequence returned.
func (sequence BioSequence) ReverseComplement(inplace bool) BioSequence {
	if !inplace {
		sequence = sequence.Copy()
	}

	bases := sequence.sequence.sequence.Bytes()

	// Walk from both ends towards the middle, swapping the complemented
	// bytes. Bit 0x20 of each byte is carried over to its complement.
	for left, right := 0, sequence.Length()-1; left <= right; left, right = left+1, right-1 {
		low, high := bases[left], bases[right]
		fromHigh := __revcmp_dna__[high&31] | (high & 0x20)
		fromLow := __revcmp_dna__[low&31] | (low & 0x20)
		bases[left], bases[right] = fromHigh, fromLow
	}

	sequence.sequence.id.WriteString("_revcomp")

	return sequence
}

43
pkg/obiseq/subseq.go Normal file
View File

@@ -0,0 +1,43 @@
package obiseq
import (
"errors"
"fmt"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
)
// Subsequence returns the sub-sequence starting at position from
// (included) and ending at position to (excluded). Coordinates start at
// position 0.
//
// When circular is true, from may be greater than or equal to to: the
// result is then the part running from from to the end of the sequence,
// followed by the part running from the start of the sequence to to.
// An error is returned, together with NilBioSequence, when from >= to on
// a non-circular request or when a coordinate is out of bounds.
//
// The definition of the source is copied, as are its annotations when it
// has any; the identifier of the result is derived from that of the
// source and from the extracted coordinates.
func (sequence BioSequence) Subsequence(from, to int, circular bool) (BioSequence, error) {
	length := sequence.Length()

	switch {
	case from >= to && !circular:
		return NilBioSequence, errors.New("from greater than to")
	case from < 0 || from >= length:
		return NilBioSequence, errors.New("from out of bounds")
	case to <= 0 || to > length:
		return NilBioSequence, errors.New("to out of bounds")
	}

	var sub BioSequence

	if from >= to {
		// Circular case: take the end of the sequence, then append its
		// beginning. The coordinates were checked above, so the error of
		// the inner call is ignored.
		sub, _ = sequence.Subsequence(from, length, false)
		sub.Write(sequence.Sequence()[0:to])
	} else {
		sub = MakeEmptyBioSequence()
		sub.Write(sequence.Sequence()[from:to])
		// The identifier reports the start position counted from 1.
		fmt.Fprintf(&sub.sequence.id, "%s_sub[%d..%d]", sequence.Id(), from+1, to)
		sub.sequence.definition.Write(sequence.sequence.definition.Bytes())
	}

	if annotations := sequence.Annotations(); len(annotations) > 0 {
		goutils.CopyMap(sub.Annotations(), annotations)
	}

	return sub, nil
}

1
pkg/obiseq/types.go Normal file
View File

@@ -0,0 +1 @@
package obiseq

134
pkg/obiseq/workers.go Normal file
View File

@@ -0,0 +1,134 @@
package obiseq
import (
"log"
"time"
)
type (
	// SeqAnnotator is a function receiving a sequence and returning
	// nothing.
	SeqAnnotator func(BioSequence)

	// SeqWorker is a function receiving a sequence and returning a
	// sequence.
	SeqWorker func(BioSequence) BioSequence

	// SeqSliceWorker is a function receiving a slice of sequences and
	// returning a slice of sequences.
	SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice
)
// AnnotatorToSeqWorker wraps a SeqAnnotator into a SeqWorker that calls
// the annotator on the sequence and then returns that same sequence.
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
	return func(seq BioSequence) BioSequence {
		function(seq)
		return seq
	}
}
// MakeIWorker returns a new iterator yielding the result of worker
// applied to every sequence of the receiver, in reading order. A single
// goroutine runs the worker. The buffer size of the new iterator
// defaults to that of the receiver and can be overridden with the first
// value of sizes.
func (iterator IBioSequence) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequence {
	capacity := iterator.BufferSize()
	if len(sizes) > 0 {
		capacity = sizes[0]
	}

	processed := MakeIBioSequence(capacity)
	processed.Add(1)

	// Closer: shuts the output channel once the producer has called Done.
	go func() {
		processed.Wait()
		close(processed.pointer.channel)
	}()

	go func() {
		out := processed.pointer.channel
		for iterator.Next() {
			out <- worker(iterator.Get())
		}
		processed.Done()
	}()

	return processed
}
// MakeIWorker returns a new batch iterator in which every sequence of
// every batch of the receiver has been replaced, in place in the batch
// slice, by the result of worker. Several goroutines, each reading from
// its own Split of the receiver, run the worker, so batches may come out
// in a different order than they went in.
//
// The first value of sizes is the number of goroutines (default 4), the
// second one the buffer size of the new iterator (default: that of the
// receiver).
func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequenceBatch {
	nworkers, buffsize := 4, iterator.BufferSize()
	if len(sizes) >= 1 {
		nworkers = sizes[0]
	}
	if len(sizes) >= 2 {
		buffsize = sizes[1]
	}

	processed := MakeIBioSequenceBatch(buffsize)
	processed.Add(nworkers)

	// Closer: once every goroutine has called Done, waits for the output
	// channel to be empty, then closes it.
	go func() {
		processed.Wait()
		for len(processed.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(processed.pointer.channel)
		log.Println("End of the batch workers")
	}()

	// Body of one goroutine, reading from its own view of the source.
	run := func(source IBioSequenceBatch) {
		for source.Next() {
			batch := source.Get()
			for i := range batch.slice {
				batch.slice[i] = worker(batch.slice[i])
			}
			processed.pointer.channel <- batch
		}
		processed.Done()
	}

	log.Println("Start of the batch workers")
	for started := 0; started < nworkers; started++ {
		go run(iterator.Split())
	}

	return processed
}
// MakeISliceWorker returns a new batch iterator in which the slice of
// every batch of the receiver has been replaced by the result of worker
// applied to it; the order number of the batch is kept. Several
// goroutines, each reading from its own Split of the receiver, run the
// worker, so batches may come out in a different order than they went in.
//
// The first value of sizes is the number of goroutines (default 4), the
// second one the buffer size of the new iterator (default: that of the
// receiver).
func (iterator IBioSequenceBatch) MakeISliceWorker(worker SeqSliceWorker, sizes ...int) IBioSequenceBatch {
	nworkers, buffsize := 4, iterator.BufferSize()
	if len(sizes) >= 1 {
		nworkers = sizes[0]
	}
	if len(sizes) >= 2 {
		buffsize = sizes[1]
	}

	processed := MakeIBioSequenceBatch(buffsize)
	processed.Add(nworkers)

	// Closer: once every goroutine has called Done, waits for the output
	// channel to be empty, then closes it.
	go func() {
		processed.Wait()
		for len(processed.Channel()) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(processed.pointer.channel)
		log.Println("End of the batch slice workers")
	}()

	// Body of one goroutine, reading from its own view of the source.
	run := func(source IBioSequenceBatch) {
		for source.Next() {
			batch := source.Get()
			batch.slice = worker(batch.slice)
			processed.pointer.channel <- batch
		}
		processed.Done()
	}

	log.Println("Start of the batch slice workers")
	for started := 0; started < nworkers; started++ {
		go run(iterator.Split())
	}

	return processed
}