mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 08:40:26 +00:00
First commit
This commit is contained in:
358
pkg/obiseq/batchiterator.go
Normal file
358
pkg/obiseq/batchiterator.go
Normal file
@@ -0,0 +1,358 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"log"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type BioSequenceBatch struct {
|
||||
slice BioSequenceSlice
|
||||
order int
|
||||
}
|
||||
|
||||
var NilBioSequenceBatch = BioSequenceBatch{nil, -1}
|
||||
|
||||
func MakeBioSequenceBatch(order int, sequences ...BioSequence) BioSequenceBatch {
|
||||
return BioSequenceBatch{
|
||||
slice: sequences,
|
||||
order: order,
|
||||
}
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Order() int {
|
||||
return batch.order
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Slice() BioSequenceSlice {
|
||||
return batch.slice
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Length() int {
|
||||
return len(batch.slice)
|
||||
}
|
||||
func (batch BioSequenceBatch) IsNil() bool {
|
||||
return batch.slice == nil
|
||||
}
|
||||
|
||||
// Structure implementing an iterator over bioseq.BioSequenceBatch
|
||||
// based on a channel.
|
||||
type __ibiosequencebatch__ struct {
|
||||
channel chan BioSequenceBatch
|
||||
current BioSequenceBatch
|
||||
all_done *sync.WaitGroup
|
||||
buffer_size int
|
||||
finished bool
|
||||
p_finished *bool
|
||||
}
|
||||
|
||||
type IBioSequenceBatch struct {
|
||||
pointer *__ibiosequencebatch__
|
||||
}
|
||||
|
||||
var NilIBioSequenceBatch = IBioSequenceBatch{pointer: nil}
|
||||
|
||||
func MakeIBioSequenceBatch(sizes ...int) IBioSequenceBatch {
|
||||
buffsize := 1
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
i := __ibiosequencebatch__{
|
||||
channel: make(chan BioSequenceBatch, buffsize),
|
||||
current: NilBioSequenceBatch,
|
||||
buffer_size: buffsize,
|
||||
finished: false,
|
||||
p_finished: nil}
|
||||
i.p_finished = &i.finished
|
||||
waiting := sync.WaitGroup{}
|
||||
i.all_done = &waiting
|
||||
ii := IBioSequenceBatch{&i}
|
||||
return ii
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Add(n int) {
|
||||
iterator.pointer.all_done.Add(n)
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Done() {
|
||||
iterator.pointer.all_done.Done()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Wait() {
|
||||
iterator.pointer.all_done.Wait()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Channel() chan BioSequenceBatch {
|
||||
return iterator.pointer.channel
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) IsNil() bool {
|
||||
return iterator.pointer == nil
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) BufferSize() int {
|
||||
return iterator.pointer.buffer_size
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Split() IBioSequenceBatch {
|
||||
i := __ibiosequencebatch__{
|
||||
channel: iterator.pointer.channel,
|
||||
current: NilBioSequenceBatch,
|
||||
all_done: iterator.pointer.all_done,
|
||||
buffer_size: iterator.pointer.buffer_size,
|
||||
finished: false,
|
||||
p_finished: iterator.pointer.p_finished}
|
||||
new_iter := IBioSequenceBatch{&i}
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Next() bool {
|
||||
if *(iterator.pointer.p_finished) {
|
||||
return false
|
||||
}
|
||||
next, ok := (<-iterator.pointer.channel)
|
||||
|
||||
if ok {
|
||||
iterator.pointer.current = next
|
||||
return true
|
||||
}
|
||||
|
||||
iterator.pointer.current = NilBioSequenceBatch
|
||||
*iterator.pointer.p_finished = true
|
||||
return false
|
||||
}
|
||||
|
||||
// The 'Get' method returns the instance of BioSequenceBatch
|
||||
// currently pointed by the iterator. You have to use the
|
||||
// 'Next' method to move to the next entry before calling
|
||||
// 'Get' to retreive the following instance.
|
||||
func (iterator IBioSequenceBatch) Get() BioSequenceBatch {
|
||||
return iterator.pointer.current
|
||||
}
|
||||
|
||||
// Finished returns 'true' value if no more data is available
|
||||
// from the iterator.
|
||||
func (iterator IBioSequenceBatch) Finished() bool {
|
||||
return *iterator.pointer.p_finished
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) IBioSequence(sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequence(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
|
||||
for _, s := range batch.slice {
|
||||
new_iter.pointer.channel <- s
|
||||
}
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) SortBatches(sizes ...int) IBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]BioSequenceBatch)
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
if batch.order == next_to_send {
|
||||
new_iter.pointer.channel <- batch
|
||||
next_to_send++
|
||||
batch, ok := received[next_to_send]
|
||||
for ok {
|
||||
new_iter.pointer.channel <- batch
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
batch, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[batch.order] = batch
|
||||
}
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSequenceBatch {
|
||||
|
||||
if len(iterators) == 0 {
|
||||
return iterator
|
||||
}
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
new_iter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.Channel())
|
||||
}()
|
||||
|
||||
go func() {
|
||||
previous_max := 0
|
||||
max_order := 0
|
||||
|
||||
for iterator.Next() {
|
||||
s := iterator.Get()
|
||||
if s.order > max_order {
|
||||
max_order = s.order
|
||||
}
|
||||
new_iter.Channel() <- MakeBioSequenceBatch(s.order+previous_max, s.slice...)
|
||||
}
|
||||
|
||||
previous_max = max_order + 1
|
||||
for _, iter := range iterators {
|
||||
for iter.Next() {
|
||||
s := iter.Get()
|
||||
if (s.order + previous_max) > max_order {
|
||||
max_order = s.order + previous_max
|
||||
}
|
||||
|
||||
new_iter.Channel() <- MakeBioSequenceBatch(s.order+previous_max, s.slice...)
|
||||
}
|
||||
previous_max = max_order + 1
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
// Redistributes sequences from a IBioSequenceBatch into a new
|
||||
// IBioSequenceBatch with every batches having the same size
|
||||
// indicated in parameter. Rebatching implies to sort the
|
||||
// source IBioSequenceBatch.
|
||||
func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
order := 0
|
||||
iterator = iterator.SortBatches()
|
||||
buffer := make(BioSequenceSlice, 0, size)
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
for _, s := range seqs.slice {
|
||||
buffer = append(buffer, s)
|
||||
if len(buffer) == size {
|
||||
new_iter.Channel() <- MakeBioSequenceBatch(order, buffer...)
|
||||
order++
|
||||
buffer = make(BioSequenceSlice, 0, size)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(buffer) > 0 {
|
||||
new_iter.Channel() <- MakeBioSequenceBatch(order, buffer...)
|
||||
}
|
||||
|
||||
new_iter.Done()
|
||||
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Destroy() {
|
||||
|
||||
log.Println("Start recycling of Bioseq objects")
|
||||
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
(&seq).Destroy()
|
||||
}
|
||||
}
|
||||
log.Println("End of the recycling of Bioseq objects")
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
batchsize := 5000
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
iterator = iterator.Rebatch(batchsize)
|
||||
reverse = reverse.Rebatch(batchsize)
|
||||
|
||||
new_iter := MakeIPairedBioSequenceBatch(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
log.Println("End of association of paired reads")
|
||||
}()
|
||||
|
||||
log.Println("Start association of paired reads")
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
if !reverse.Next() {
|
||||
log.Panicln("Etrange reverse pas prêt")
|
||||
}
|
||||
new_iter.Channel() <- MakePairedBioSequenceBatch(iterator.Get(),
|
||||
reverse.Get())
|
||||
}
|
||||
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
185
pkg/obiseq/biosequence.go
Normal file
185
pkg/obiseq/biosequence.go
Normal file
@@ -0,0 +1,185 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/md5"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
|
||||
)
|
||||
|
||||
// Quality is a vector of per-position quality scores.
type Quality []uint8

// __default_qualities__ is a shared, growing buffer filled with the
// default score 40. It is extended on demand by
// __make_default_qualities__.
var __default_qualities__ = make(Quality, 0, 500)

// __make_default_qualities__ returns a Quality of the requested length
// where every score is 40. The result is a view on the shared buffer, so
// its content must be treated as read only.
//
// The buffer is grown to exactly length elements, and the returned slice
// has its capacity capped to its length: appending to it reallocates
// instead of overwriting the shared defaults.
//
// NOTE(review): the shared buffer is grown without synchronisation;
// concurrent callers would need external locking.
func __make_default_qualities__(length int) Quality {
	for len(__default_qualities__) < length {
		__default_qualities__ = append(__default_qualities__, 40)
	}
	return __default_qualities__[0:length:length]
}
|
||||
|
||||
// Annotation stores the key/value annotations attached to a sequence.
type Annotation map[string]interface{}

// __sequence__ holds the actual data of a BioSequence. Every textual
// field is kept in its own bytes.Buffer.
type __sequence__ struct {
	id          bytes.Buffer
	definition  bytes.Buffer
	sequence    bytes.Buffer
	qualities   bytes.Buffer
	feature     bytes.Buffer
	annotations Annotation
}

// BioSequence is a handle on a __sequence__ record. Copying a BioSequence
// copies the handle only: both copies refer to the same record.
type BioSequence struct {
	sequence *__sequence__
}

// BioSequenceSlice is a slice of BioSequence handles.
type BioSequenceSlice []BioSequence

// NilBioSequence is the handle without any record behind it.
var NilBioSequence = BioSequence{}

// IsNil reports whether the handle has no record behind it.
func (bs BioSequence) IsNil() bool {
	return bs.sequence == nil
}
|
||||
|
||||
func (s BioSequence) Reset() {
|
||||
s.sequence.id.Reset()
|
||||
s.sequence.definition.Reset()
|
||||
s.sequence.sequence.Reset()
|
||||
s.sequence.qualities.Reset()
|
||||
s.sequence.feature.Reset()
|
||||
|
||||
for k := range s.sequence.annotations {
|
||||
delete(s.sequence.annotations, k)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func (s BioSequence) Copy() BioSequence {
|
||||
new_seq := MakeEmptyBioSequence()
|
||||
new_seq.sequence.id.Write(s.sequence.id.Bytes())
|
||||
new_seq.sequence.definition.Write(s.sequence.definition.Bytes())
|
||||
new_seq.sequence.sequence.Write(s.sequence.sequence.Bytes())
|
||||
new_seq.sequence.qualities.Write(s.sequence.qualities.Bytes())
|
||||
new_seq.sequence.feature.Write(s.sequence.feature.Bytes())
|
||||
|
||||
if len(s.sequence.annotations) > 0 {
|
||||
goutils.CopyMap(new_seq.sequence.annotations,
|
||||
s.sequence.annotations)
|
||||
}
|
||||
|
||||
return new_seq
|
||||
}
|
||||
|
||||
func (s BioSequence) Id() string {
|
||||
return s.sequence.id.String()
|
||||
}
|
||||
func (s BioSequence) Definition() string {
|
||||
return s.sequence.definition.String()
|
||||
}
|
||||
|
||||
func (s BioSequence) Sequence() []byte {
|
||||
return s.sequence.sequence.Bytes()
|
||||
}
|
||||
|
||||
func (s BioSequence) String() string {
|
||||
return s.sequence.sequence.String()
|
||||
}
|
||||
func (s BioSequence) Length() int {
|
||||
return s.sequence.sequence.Len()
|
||||
}
|
||||
|
||||
func (s BioSequence) HasQualities() bool {
|
||||
return s.sequence.qualities.Len() > 0
|
||||
}
|
||||
|
||||
func (s BioSequence) Qualities() Quality {
|
||||
if s.HasQualities() {
|
||||
return s.sequence.qualities.Bytes()
|
||||
} else {
|
||||
return __make_default_qualities__(s.sequence.sequence.Len())
|
||||
}
|
||||
}
|
||||
|
||||
func (s BioSequence) Features() string {
|
||||
return s.sequence.feature.String()
|
||||
}
|
||||
|
||||
func (s BioSequence) Annotations() Annotation {
|
||||
return s.sequence.annotations
|
||||
}
|
||||
|
||||
func (s BioSequence) MD5() [16]byte {
|
||||
return md5.Sum(s.sequence.sequence.Bytes())
|
||||
}
|
||||
|
||||
func (s BioSequence) Count() int {
|
||||
if s.sequence.annotations == nil {
|
||||
return 1
|
||||
}
|
||||
|
||||
if val, ok := (s.sequence.annotations)["count"]; ok {
|
||||
val, err := goutils.InterfaceToInt(val)
|
||||
if err == nil {
|
||||
return val
|
||||
}
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
func (s BioSequence) Taxid() int {
|
||||
if s.sequence.annotations == nil {
|
||||
return 1
|
||||
}
|
||||
|
||||
if val, ok := (s.sequence.annotations)["taxid"]; ok {
|
||||
val, err := goutils.InterfaceToInt(val)
|
||||
if err == nil {
|
||||
return val
|
||||
}
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
func (s BioSequence) SetId(id string) {
|
||||
s.sequence.id.Reset()
|
||||
s.sequence.id.WriteString(id)
|
||||
}
|
||||
|
||||
func (s BioSequence) SetDefinition(definition string) {
|
||||
s.sequence.definition.Reset()
|
||||
s.sequence.definition.WriteString(definition)
|
||||
}
|
||||
|
||||
func (s BioSequence) SetFeatures(feature string) {
|
||||
s.sequence.feature.Reset()
|
||||
s.sequence.feature.WriteString(feature)
|
||||
}
|
||||
|
||||
func (s BioSequence) SetSequence(sequence []byte) {
|
||||
s.sequence.sequence.Reset()
|
||||
s.sequence.sequence.Write(sequence)
|
||||
}
|
||||
|
||||
func (s BioSequence) SetQualities(qualities Quality) {
|
||||
s.sequence.qualities.Reset()
|
||||
s.sequence.qualities.Write(qualities)
|
||||
}
|
||||
|
||||
func (s BioSequence) Write(data []byte) (int, error) {
|
||||
return s.sequence.sequence.Write(data)
|
||||
}
|
||||
|
||||
func (s BioSequence) WriteString(data string) (int, error) {
|
||||
return s.sequence.sequence.WriteString(data)
|
||||
}
|
||||
|
||||
func (s BioSequence) WriteByte(data byte) error {
|
||||
return s.sequence.sequence.WriteByte(data)
|
||||
}
|
||||
|
||||
func (s BioSequence) WriteRune(data rune) (int, error) {
|
||||
return s.sequence.sequence.WriteRune(data)
|
||||
}
|
||||
326
pkg/obiseq/iterator.go
Normal file
326
pkg/obiseq/iterator.go
Normal file
@@ -0,0 +1,326 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Private structure implementing an iterator over
|
||||
// bioseq.BioSequence based on a channel.
|
||||
type __ibiosequence__ struct {
|
||||
channel chan BioSequence
|
||||
current BioSequence
|
||||
all_done *sync.WaitGroup
|
||||
buffer_size int
|
||||
finished bool
|
||||
p_finished *bool
|
||||
}
|
||||
|
||||
type IBioSequence struct {
|
||||
pointer *__ibiosequence__
|
||||
}
|
||||
|
||||
var NilIBioSequence = IBioSequence{pointer: nil}
|
||||
|
||||
func (iterator IBioSequence) IsNil() bool {
|
||||
return iterator.pointer == nil
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Add(n int) {
|
||||
iterator.pointer.all_done.Add(n)
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Done() {
|
||||
iterator.pointer.all_done.Done()
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Wait() {
|
||||
iterator.pointer.all_done.Wait()
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Channel() chan BioSequence {
|
||||
return iterator.pointer.channel
|
||||
}
|
||||
func (iterator IBioSequence) PChannel() *chan BioSequence {
|
||||
return &(iterator.pointer.channel)
|
||||
}
|
||||
|
||||
func MakeIBioSequence(sizes ...int) IBioSequence {
|
||||
buffsize := 1
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
i := __ibiosequence__{
|
||||
channel: make(chan BioSequence, buffsize),
|
||||
current: NilBioSequence,
|
||||
buffer_size: buffsize,
|
||||
finished: false,
|
||||
p_finished: nil}
|
||||
i.p_finished = &i.finished
|
||||
waiting := sync.WaitGroup{}
|
||||
i.all_done = &waiting
|
||||
ii := IBioSequence{&i}
|
||||
return ii
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Split() IBioSequence {
|
||||
i := __ibiosequence__{
|
||||
channel: iterator.pointer.channel,
|
||||
current: NilBioSequence,
|
||||
finished: false,
|
||||
all_done: iterator.pointer.all_done,
|
||||
buffer_size: iterator.pointer.buffer_size,
|
||||
p_finished: iterator.pointer.p_finished}
|
||||
new_iter := IBioSequence{&i}
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Next() bool {
|
||||
if iterator.IsNil() || *(iterator.pointer.p_finished) {
|
||||
iterator.pointer.current = NilBioSequence
|
||||
return false
|
||||
}
|
||||
|
||||
next, ok := (<-iterator.pointer.channel)
|
||||
|
||||
if ok {
|
||||
iterator.pointer.current = next
|
||||
return true
|
||||
}
|
||||
|
||||
iterator.pointer.current = NilBioSequence
|
||||
*iterator.pointer.p_finished = true
|
||||
return false
|
||||
}
|
||||
|
||||
// The 'Get' method returns the instance of BioSequence
|
||||
// currently pointed by the iterator. You have to use the
|
||||
// 'Next' method to move to the next entry before calling
|
||||
// 'Get' to retreive the following instance.
|
||||
func (iterator IBioSequence) Get() BioSequence {
|
||||
return iterator.pointer.current
|
||||
}
|
||||
|
||||
// Finished returns 'true' value if no more data is available
|
||||
// from the iterator.
|
||||
func (iterator IBioSequence) Finished() bool {
|
||||
return *iterator.pointer.p_finished
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) BufferSize() int {
|
||||
return iterator.pointer.buffer_size
|
||||
}
|
||||
|
||||
// The IBioSequenceBatch converts a IBioSequence iterator
|
||||
// into an iterator oveer batches oof sequences. By default
|
||||
// the size of a batch is of 100 sequences and the iterator
|
||||
// implements a buffer equal to that of the source iterator.
|
||||
// These defaults can be overriden by specifying one or two
|
||||
// optional parametters at the method call. The first one
|
||||
// indicates the batch size. The second optional parametter
|
||||
// indicates the size of the buffer.
|
||||
func (iterator IBioSequence) IBioSequenceBatch(sizes ...int) IBioSequenceBatch {
|
||||
batchsize := 100
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
for len(new_iter.Channel()) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for j := 0; !iterator.Finished(); j++ {
|
||||
batch := BioSequenceBatch{
|
||||
slice: make(BioSequenceSlice, 0, batchsize),
|
||||
order: j}
|
||||
for i := 0; i < batchsize && iterator.Next(); i++ {
|
||||
seq := iterator.Get()
|
||||
batch.slice = append(batch.slice, seq)
|
||||
}
|
||||
new_iter.pointer.channel <- batch
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) IBioSequence(sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequence(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
s := iterator.Get()
|
||||
new_iter.pointer.channel <- s
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Skip(n int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequence(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for i := 0; iterator.Next(); i++ {
|
||||
if i >= n {
|
||||
s := iterator.Get()
|
||||
new_iter.pointer.channel <- s
|
||||
}
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Head(n int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequence(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
not_done := true
|
||||
for i := 0; iterator.Next(); i++ {
|
||||
if i < n {
|
||||
s := iterator.Get()
|
||||
new_iter.pointer.channel <- s
|
||||
} else {
|
||||
if not_done {
|
||||
new_iter.Done()
|
||||
not_done = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
// The 'Tail' method discard every data from the source iterator
|
||||
// except the 'n' last ones.
|
||||
func (iterator IBioSequence) Tail(n int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequence(buffsize)
|
||||
buffseq := make(BioSequenceSlice, n)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
var i int
|
||||
for i = 0; iterator.Next(); i++ {
|
||||
buffseq[i%n] = iterator.Get()
|
||||
}
|
||||
if i > n {
|
||||
for j := 0; j < n; j++ {
|
||||
new_iter.Channel() <- buffseq[(i+j)%n]
|
||||
}
|
||||
|
||||
} else {
|
||||
for j := 0; j < i; j++ {
|
||||
new_iter.Channel() <- buffseq[j]
|
||||
}
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
|
||||
|
||||
if len(iterators) == 0 {
|
||||
return iterator
|
||||
}
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
new_iter := MakeIBioSequence(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
s := iterator.Get()
|
||||
new_iter.pointer.channel <- s
|
||||
}
|
||||
|
||||
for _, iter := range iterators {
|
||||
for iter.Next() {
|
||||
s := iter.Get()
|
||||
new_iter.pointer.channel <- s
|
||||
}
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
19
pkg/obiseq/join.go
Normal file
19
pkg/obiseq/join.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package obiseq
|
||||
|
||||
import "git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
|
||||
|
||||
func (sequence BioSequence) Join(seq2 BioSequence, copy_annot bool) (BioSequence, error) {
|
||||
|
||||
new_seq := MakeEmptyBioSequence()
|
||||
new_seq.SetId(sequence.Id())
|
||||
new_seq.SetDefinition(sequence.Definition())
|
||||
|
||||
new_seq.Write(sequence.Sequence())
|
||||
new_seq.Write(seq2.Sequence())
|
||||
|
||||
if copy_annot {
|
||||
goutils.CopyMap(new_seq.Annotations(), sequence.Annotations())
|
||||
}
|
||||
|
||||
return new_seq, nil
|
||||
}
|
||||
196
pkg/obiseq/pairedbatchiterator.go
Normal file
196
pkg/obiseq/pairedbatchiterator.go
Normal file
@@ -0,0 +1,196 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"log"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type PairedBioSequenceBatch struct {
|
||||
forward BioSequenceSlice
|
||||
reverse BioSequenceSlice
|
||||
order int
|
||||
}
|
||||
|
||||
var NilPairedBioSequenceBatch = PairedBioSequenceBatch{nil, nil, -1}
|
||||
|
||||
func MakePairedBioSequenceBatch(forward, reverse BioSequenceBatch) PairedBioSequenceBatch {
|
||||
if forward.order != reverse.order {
|
||||
log.Fatalf("Forward order : %d and reverse order : %d are not matching",
|
||||
forward.order, reverse.order)
|
||||
}
|
||||
|
||||
for i := range reverse.slice {
|
||||
reverse.slice[i].ReverseComplement(true)
|
||||
}
|
||||
|
||||
return PairedBioSequenceBatch{
|
||||
forward: forward.slice,
|
||||
reverse: reverse.slice,
|
||||
order: forward.order,
|
||||
}
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Order() int {
|
||||
return batch.order
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Length() int {
|
||||
return len(batch.forward)
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Forward() BioSequenceSlice {
|
||||
return batch.forward
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Reverse() BioSequenceSlice {
|
||||
return batch.reverse
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) IsNil() bool {
|
||||
return batch.forward == nil
|
||||
}
|
||||
|
||||
// Structure implementing an iterator over bioseq.BioSequenceBatch
|
||||
// based on a channel.
|
||||
type __ipairedbiosequencebatch__ struct {
|
||||
channel chan PairedBioSequenceBatch
|
||||
current PairedBioSequenceBatch
|
||||
all_done *sync.WaitGroup
|
||||
buffer_size int
|
||||
finished bool
|
||||
p_finished *bool
|
||||
}
|
||||
|
||||
type IPairedBioSequenceBatch struct {
|
||||
pointer *__ipairedbiosequencebatch__
|
||||
}
|
||||
|
||||
var NilIPairedBioSequenceBatch = IPairedBioSequenceBatch{pointer: nil}
|
||||
|
||||
func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := 1
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
i := __ipairedbiosequencebatch__{
|
||||
channel: make(chan PairedBioSequenceBatch, buffsize),
|
||||
current: NilPairedBioSequenceBatch,
|
||||
buffer_size: buffsize,
|
||||
finished: false,
|
||||
p_finished: nil}
|
||||
i.p_finished = &i.finished
|
||||
waiting := sync.WaitGroup{}
|
||||
i.all_done = &waiting
|
||||
ii := IPairedBioSequenceBatch{&i}
|
||||
return ii
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Add(n int) {
|
||||
iterator.pointer.all_done.Add(n)
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Done() {
|
||||
iterator.pointer.all_done.Done()
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Wait() {
|
||||
iterator.pointer.all_done.Wait()
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Channel() chan PairedBioSequenceBatch {
|
||||
return iterator.pointer.channel
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) IsNil() bool {
|
||||
return iterator.pointer == nil
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) BufferSize() int {
|
||||
return iterator.pointer.buffer_size
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch {
|
||||
i := __ipairedbiosequencebatch__{
|
||||
channel: iterator.pointer.channel,
|
||||
current: NilPairedBioSequenceBatch,
|
||||
all_done: iterator.pointer.all_done,
|
||||
buffer_size: iterator.pointer.buffer_size,
|
||||
finished: false,
|
||||
p_finished: iterator.pointer.p_finished}
|
||||
new_iter := IPairedBioSequenceBatch{&i}
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Next() bool {
|
||||
if *(iterator.pointer.p_finished) {
|
||||
return false
|
||||
}
|
||||
next, ok := (<-iterator.pointer.channel)
|
||||
|
||||
if ok {
|
||||
iterator.pointer.current = next
|
||||
return true
|
||||
}
|
||||
|
||||
iterator.pointer.current = NilPairedBioSequenceBatch
|
||||
*iterator.pointer.p_finished = true
|
||||
return false
|
||||
}
|
||||
|
||||
// The 'Get' method returns the instance of BioSequenceBatch
|
||||
// currently pointed by the iterator. You have to use the
|
||||
// 'Next' method to move to the next entry before calling
|
||||
// 'Get' to retreive the following instance.
|
||||
func (iterator IPairedBioSequenceBatch) Get() PairedBioSequenceBatch {
|
||||
return iterator.pointer.current
|
||||
}
|
||||
|
||||
// Finished returns 'true' value if no more data is available
|
||||
// from the iterator.
|
||||
func (iterator IPairedBioSequenceBatch) Finished() bool {
|
||||
return *iterator.pointer.p_finished
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) SortBatches(sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIPairedBioSequenceBatch(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]PairedBioSequenceBatch)
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
if batch.order == next_to_send {
|
||||
new_iter.pointer.channel <- batch
|
||||
next_to_send++
|
||||
batch, ok := received[next_to_send]
|
||||
for ok {
|
||||
new_iter.pointer.channel <- batch
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
batch, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[batch.order] = batch
|
||||
}
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
|
||||
}
|
||||
34
pkg/obiseq/pool.go
Normal file
34
pkg/obiseq/pool.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
var __bioseq__pool__ = sync.Pool{
|
||||
New: func() interface{} {
|
||||
var bs __sequence__
|
||||
bs.annotations = make(Annotation, 50)
|
||||
return &bs
|
||||
},
|
||||
}
|
||||
|
||||
func MakeEmptyBioSequence() BioSequence {
|
||||
bs := BioSequence{__bioseq__pool__.Get().(*__sequence__)}
|
||||
bs.Reset()
|
||||
return bs
|
||||
}
|
||||
|
||||
func MakeBioSequence(id string,
|
||||
sequence []byte,
|
||||
definition string) BioSequence {
|
||||
bs := MakeEmptyBioSequence()
|
||||
bs.SetId(id)
|
||||
bs.SetSequence(sequence)
|
||||
bs.SetDefinition(definition)
|
||||
return bs
|
||||
}
|
||||
|
||||
func (sequence *BioSequence) Destroy() {
|
||||
__bioseq__pool__.Put(sequence.sequence)
|
||||
sequence.sequence = nil
|
||||
}
|
||||
26
pkg/obiseq/revcomp.go
Normal file
26
pkg/obiseq/revcomp.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package obiseq
|
||||
|
||||
// ".ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
|
||||
var __revcmp_dna__ = []byte(".TVGHEFCDIJMLKNOPQYSAABWXRZ#!][")
|
||||
|
||||
// Reverse complements a DNA sequence.
|
||||
// If the inplace parametter is true, that operation is done in place.
|
||||
func (sequence BioSequence) ReverseComplement(inplace bool) BioSequence {
|
||||
|
||||
if !inplace {
|
||||
sequence = sequence.Copy()
|
||||
}
|
||||
|
||||
s := sequence.sequence.sequence.Bytes()
|
||||
|
||||
for i, j := sequence.Length()-1, 0; i >= j; i-- {
|
||||
|
||||
s[j], s[i] = __revcmp_dna__[s[i]&31]|(s[i]&0x20),
|
||||
__revcmp_dna__[s[j]&31]|(s[j]&0x20)
|
||||
j++
|
||||
}
|
||||
|
||||
sequence.sequence.id.WriteString("_revcomp")
|
||||
|
||||
return sequence
|
||||
}
|
||||
43
pkg/obiseq/subseq.go
Normal file
43
pkg/obiseq/subseq.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
|
||||
)
|
||||
|
||||
// Returns a sub sequence start from position 'from' included,
|
||||
// to position 'to' excluded. Coordinates start at position 0.
|
||||
func (sequence BioSequence) Subsequence(from, to int, circular bool) (BioSequence, error) {
|
||||
|
||||
if from >= to && !circular {
|
||||
return NilBioSequence, errors.New("from greater than to")
|
||||
}
|
||||
|
||||
if from < 0 || from >= sequence.Length() {
|
||||
return NilBioSequence, errors.New("from out of bounds")
|
||||
}
|
||||
|
||||
if to <= 0 || to > sequence.Length() {
|
||||
return NilBioSequence, errors.New("to out of bounds")
|
||||
}
|
||||
|
||||
var new_seq BioSequence
|
||||
|
||||
if from < to {
|
||||
new_seq = MakeEmptyBioSequence()
|
||||
new_seq.Write(sequence.Sequence()[from:to])
|
||||
fmt.Fprintf(&new_seq.sequence.id, "%s_sub[%d..%d]", sequence.Id(), from+1, to)
|
||||
new_seq.sequence.definition.Write(sequence.sequence.definition.Bytes())
|
||||
} else {
|
||||
new_seq, _ = sequence.Subsequence(from, sequence.Length(), false)
|
||||
new_seq.Write(sequence.Sequence()[0:to])
|
||||
}
|
||||
|
||||
if len(sequence.Annotations()) > 0 {
|
||||
goutils.CopyMap(new_seq.Annotations(), sequence.Annotations())
|
||||
}
|
||||
|
||||
return new_seq, nil
|
||||
}
|
||||
1
pkg/obiseq/types.go
Normal file
1
pkg/obiseq/types.go
Normal file
@@ -0,0 +1 @@
|
||||
package obiseq
|
||||
134
pkg/obiseq/workers.go
Normal file
134
pkg/obiseq/workers.go
Normal file
@@ -0,0 +1,134 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"log"
|
||||
"time"
|
||||
)
|
||||
|
||||
type SeqAnnotator func(BioSequence)
|
||||
|
||||
type SeqWorker func(BioSequence) BioSequence
|
||||
type SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice
|
||||
|
||||
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
|
||||
f := func(seq BioSequence) BioSequence {
|
||||
function(seq)
|
||||
return seq
|
||||
}
|
||||
return f
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequence(buffsize)
|
||||
|
||||
new_iter.Add(1)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
close(new_iter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
seq := iterator.Get()
|
||||
seq = worker(seq)
|
||||
new_iter.pointer.channel <- seq
|
||||
}
|
||||
new_iter.Done()
|
||||
}()
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequenceBatch {
|
||||
nworkers := 4
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
new_iter.Add(nworkers)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
for len(new_iter.Channel()) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(new_iter.pointer.channel)
|
||||
log.Println("End of the batch workers")
|
||||
|
||||
}()
|
||||
|
||||
f := func(iterator IBioSequenceBatch) {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
for i, seq := range batch.slice {
|
||||
batch.slice[i] = worker(seq)
|
||||
}
|
||||
new_iter.pointer.channel <- batch
|
||||
}
|
||||
new_iter.Done()
|
||||
}
|
||||
|
||||
log.Println("Start of the batch workers")
|
||||
for i := 0; i < nworkers; i++ {
|
||||
go f(iterator.Split())
|
||||
}
|
||||
|
||||
return new_iter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) MakeISliceWorker(worker SeqSliceWorker, sizes ...int) IBioSequenceBatch {
|
||||
nworkers := 4
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
new_iter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
new_iter.Add(nworkers)
|
||||
|
||||
go func() {
|
||||
new_iter.Wait()
|
||||
for len(new_iter.Channel()) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(new_iter.pointer.channel)
|
||||
log.Println("End of the batch slice workers")
|
||||
}()
|
||||
|
||||
f := func(iterator IBioSequenceBatch) {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
batch.slice = worker(batch.slice)
|
||||
new_iter.pointer.channel <- batch
|
||||
}
|
||||
new_iter.Done()
|
||||
}
|
||||
|
||||
log.Println("Start of the batch slice workers")
|
||||
for i := 0; i < nworkers; i++ {
|
||||
go f(iterator.Split())
|
||||
}
|
||||
|
||||
return new_iter
|
||||
}
|
||||
Reference in New Issue
Block a user