Files
obitools4/pkg/obiseq/batchiterator.go.old

534 lines
11 KiB
Go
Raw Normal View History

2022-01-13 23:27:39 +01:00
package obiseq
import (
	"fmt"
	"log"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	"github.com/tevino/abool/v2"
)
// BioSequenceBatch groups a slice of sequences with the order number
// assigned to that group.
type BioSequenceBatch struct {
	// slice holds the sequences of the batch; a nil slice marks a nil batch.
	slice BioSequenceSlice
	// order is the order number given to the batch when it was built.
	order int
}
// NilBioSequenceBatch is the batch value used to represent "no batch":
// its slice is nil and its order is -1.
var NilBioSequenceBatch = BioSequenceBatch{slice: nil, order: -1}
func MakeBioSequenceBatch(order int, sequences BioSequenceSlice) BioSequenceBatch {
2022-01-13 23:27:39 +01:00
return BioSequenceBatch{
slice: sequences,
order: order,
}
}
// Order returns the order number stored in the batch.
func (batch BioSequenceBatch) Order() int {
	position := batch.order
	return position
}
2022-02-18 09:59:47 +01:00
// Reorder returns a copy of the batch sharing the same sequence slice but
// carrying newOrder as its order number. The receiver is left untouched
// because it is passed by value.
func (batch BioSequenceBatch) Reorder(newOrder int) BioSequenceBatch {
	return BioSequenceBatch{
		slice: batch.slice,
		order: newOrder,
	}
}
2022-01-13 23:27:39 +01:00
// Slice returns the sequence slice held by the batch (not a copy).
func (batch BioSequenceBatch) Slice() BioSequenceSlice {
	sequences := batch.slice
	return sequences
}
// Length returns the number of sequences in the batch.
func (batch BioSequenceBatch) Length() int {
	count := len(batch.slice)
	return count
}
// NotEmpty reports what the underlying slice's NotEmpty method reports.
func (batch BioSequenceBatch) NotEmpty() bool {
	sequences := batch.slice
	return sequences.NotEmpty()
}
// Pop0 delegates to the Pop0 method of the underlying slice and returns
// the sequence it yields.
//
// NOTE(review): the receiver is a copy of the batch, so whether the removal
// is visible to the caller depends on how BioSequenceSlice.Pop0 is
// implemented — confirm against its definition.
func (batch BioSequenceBatch) Pop0() *BioSequence {
	first := batch.slice.Pop0()
	return first
}
2022-01-13 23:27:39 +01:00
// IsNil reports whether the batch carries no slice at all, as is the case
// for NilBioSequenceBatch.
func (batch BioSequenceBatch) IsNil() bool {
	if batch.slice != nil {
		return false
	}
	return true
}
2022-02-18 22:53:09 +01:00
// Recycle hands the batch's sequence slice over to the slice's own Recycle
// method.
//
// The method has a value receiver, so it cannot reset the caller's batch:
// the former `batch.slice = nil` only modified a local copy and has been
// removed as an ineffectual assignment. Callers must not use the batch's
// slice after calling Recycle.
func (batch BioSequenceBatch) Recycle() {
	batch.slice.Recycle()
}
2022-01-13 23:27:39 +01:00
// Structure implementing an iterator over bioseq.BioSequenceBatch
// based on a channel.
type _IBioSequenceBatch struct {
channel chan BioSequenceBatch
current BioSequenceBatch
pushBack *abool.AtomicBool
all_done *sync.WaitGroup
lock *sync.RWMutex
buffer_size int32
batch_size int32
sequence_format string
finished *abool.AtomicBool
2022-01-13 23:27:39 +01:00
}
type IBioSequenceBatch struct {
pointer *_IBioSequenceBatch
2022-01-13 23:27:39 +01:00
}
// NilIBioSequenceBatch is the iterator handle with no underlying state
// (its pointer is nil); IsNil returns true for it.
var NilIBioSequenceBatch = IBioSequenceBatch{}
func MakeIBioSequenceBatch(sizes ...int) IBioSequenceBatch {
buffsize := int32(1)
2022-01-13 23:27:39 +01:00
if len(sizes) > 0 {
buffsize = int32(sizes[0])
2022-01-13 23:27:39 +01:00
}
i := _IBioSequenceBatch{
channel: make(chan BioSequenceBatch, buffsize),
current: NilBioSequenceBatch,
pushBack: abool.New(),
buffer_size: buffsize,
batch_size: -1,
sequence_format: "",
finished: abool.New(),
}
2022-01-13 23:27:39 +01:00
waiting := sync.WaitGroup{}
i.all_done = &waiting
lock := sync.RWMutex{}
i.lock = &lock
2022-01-13 23:27:39 +01:00
ii := IBioSequenceBatch{&i}
return ii
}
// Add adds n to the counter of the iterator's wait group.
func (iterator IBioSequenceBatch) Add(n int) {
	producers := iterator.pointer.all_done
	producers.Add(n)
}
// Done decrements the counter of the iterator's wait group by one.
func (iterator IBioSequenceBatch) Done() {
	producers := iterator.pointer.all_done
	producers.Done()
}
// Unlock releases the write lock of the iterator's mutex.
func (iterator IBioSequenceBatch) Unlock() {
	mutex := iterator.pointer.lock
	mutex.Unlock()
}
// Lock acquires the write lock of the iterator's mutex.
func (iterator IBioSequenceBatch) Lock() {
	mutex := iterator.pointer.lock
	mutex.Lock()
}
// RLock acquires the read lock of the iterator's mutex.
func (iterator IBioSequenceBatch) RLock() {
	mutex := iterator.pointer.lock
	mutex.RLock()
}
// RUnlock releases the read lock of the iterator's mutex.
func (iterator IBioSequenceBatch) RUnlock() {
	mutex := iterator.pointer.lock
	mutex.RUnlock()
}
2022-01-13 23:27:39 +01:00
// Wait blocks until the counter of the iterator's wait group is zero.
func (iterator IBioSequenceBatch) Wait() {
	producers := iterator.pointer.all_done
	producers.Wait()
}
// Channel returns the channel the iterator reads its batches from.
func (iterator IBioSequenceBatch) Channel() chan BioSequenceBatch {
	batches := iterator.pointer.channel
	return batches
}
// IsNil reports whether the handle has no underlying iterator state, as
// is the case for NilIBioSequenceBatch.
func (iterator IBioSequenceBatch) IsNil() bool {
	if iterator.pointer != nil {
		return false
	}
	return true
}
// BufferSize returns the buffer size recorded in the iterator, read
// atomically.
func (iterator IBioSequenceBatch) BufferSize() int {
	size := atomic.LoadInt32(&iterator.pointer.buffer_size)
	return int(size)
}
// BatchSize returns the batch size recorded in the iterator, read
// atomically. It is -1 on a freshly made iterator.
func (iterator IBioSequenceBatch) BatchSize() int {
	size := atomic.LoadInt32(&iterator.pointer.batch_size)
	return int(size)
}
func (iterator IBioSequenceBatch) SetBatchSize(size int) error {
if size >= 0 {
atomic.StoreInt32(&iterator.pointer.batch_size, int32(size))
return nil
}
return fmt.Errorf("size (%d) cannot be negative", size)
2022-01-13 23:27:39 +01:00
}
func (iterator IBioSequenceBatch) Split() IBioSequenceBatch {
iterator.pointer.lock.RLock()
defer iterator.pointer.lock.RUnlock()
i := _IBioSequenceBatch{
channel: iterator.pointer.channel,
current: NilBioSequenceBatch,
pushBack: abool.New(),
all_done: iterator.pointer.all_done,
buffer_size: iterator.pointer.buffer_size,
batch_size: iterator.pointer.batch_size,
sequence_format: iterator.pointer.sequence_format,
finished: iterator.pointer.finished}
lock := sync.RWMutex{}
i.lock = &lock
2022-01-14 17:32:12 +01:00
newIter := IBioSequenceBatch{&i}
return newIter
2022-01-13 23:27:39 +01:00
}
func (iterator IBioSequenceBatch) Next() bool {
if iterator.pointer.pushBack.IsSet() {
iterator.pointer.pushBack.UnSet()
return true
}
if iterator.pointer.finished.IsSet() {
return false
}
2022-01-13 23:27:39 +01:00
next, ok := (<-iterator.pointer.channel)
if ok {
iterator.pointer.current = next
return true
}
iterator.pointer.current = NilBioSequenceBatch
iterator.pointer.finished.Set()
2022-01-13 23:27:39 +01:00
return false
}
// PushBack asks the iterator to serve its current batch again on the
// following call to Next. It does nothing when there is no current batch.
func (iterator IBioSequenceBatch) PushBack() {
	state := iterator.pointer
	if state.current.IsNil() {
		return
	}
	state.pushBack.Set()
}
2022-01-13 23:27:39 +01:00
// Get returns the BioSequenceBatch the iterator currently points to.
// Call Next to advance to the following entry before calling Get again
// to retrieve it.
func (iterator IBioSequenceBatch) Get() BioSequenceBatch {
	batch := iterator.pointer.current
	return batch
}
// Push sends batch on the iterator's channel, blocking while the channel
// is full.
//
// It panics (through log.Panicln) when batch is nil or contains no
// sequence.
func (iterator IBioSequenceBatch) Push(batch BioSequenceBatch) {
	switch {
	case batch.IsNil():
		log.Panicln("An Nil batch is pushed on the channel")
	case batch.Length() == 0:
		log.Panicln("An empty batch is pushed on the channel")
	}

	iterator.pointer.channel <- batch
}
// Close closes the iterator's channel.
func (iterator IBioSequenceBatch) Close() {
	batches := iterator.pointer.channel
	close(batches)
}
// WaitAndClose waits for the iterator's wait group to reach zero, then
// polls every millisecond until the channel holds no buffered batch, and
// finally closes the channel.
func (iterator IBioSequenceBatch) WaitAndClose() {
	iterator.pointer.all_done.Wait()

	batches := iterator.pointer.channel
	for len(batches) != 0 {
		time.Sleep(time.Millisecond)
	}

	close(batches)
}
2022-01-13 23:27:39 +01:00
// Finished returns 'true' value if no more data is available
// from the iterator.
func (iterator IBioSequenceBatch) Finished() bool {
return iterator.pointer.finished.IsSet()
2022-01-13 23:27:39 +01:00
}
func (iterator IBioSequenceBatch) IBioSequence(sizes ...int) IBioSequence {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
2022-01-14 17:32:12 +01:00
newIter := MakeIBioSequence(buffsize)
2022-01-13 23:27:39 +01:00
2022-01-14 17:32:12 +01:00
newIter.Add(1)
2022-01-13 23:27:39 +01:00
go func() {
2022-01-14 17:32:12 +01:00
newIter.Wait()
close(newIter.pointer.channel)
2022-01-13 23:27:39 +01:00
}()
go func() {
for iterator.Next() {
batch := iterator.Get()
for batch.NotEmpty() {
newIter.pointer.channel <- batch.Pop0()
2022-01-13 23:27:39 +01:00
}
batch.Recycle()
2022-01-13 23:27:39 +01:00
}
2022-01-14 17:32:12 +01:00
newIter.Done()
2022-01-13 23:27:39 +01:00
}()
2022-01-14 17:32:12 +01:00
return newIter
2022-01-13 23:27:39 +01:00
}
func (iterator IBioSequenceBatch) SortBatches(sizes ...int) IBioSequenceBatch {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
2022-01-14 17:32:12 +01:00
newIter := MakeIBioSequenceBatch(buffsize)
2022-01-13 23:27:39 +01:00
2022-01-14 17:32:12 +01:00
newIter.Add(1)
2022-01-13 23:27:39 +01:00
go func() {
2022-01-14 17:32:12 +01:00
newIter.Wait()
close(newIter.pointer.channel)
2022-01-13 23:27:39 +01:00
}()
next_to_send := 0
received := make(map[int]BioSequenceBatch)
go func() {
for iterator.Next() {
batch := iterator.Get()
if batch.order == next_to_send {
2022-01-14 17:32:12 +01:00
newIter.pointer.channel <- batch
2022-01-13 23:27:39 +01:00
next_to_send++
batch, ok := received[next_to_send]
for ok {
2022-01-14 17:32:12 +01:00
newIter.pointer.channel <- batch
2022-01-13 23:27:39 +01:00
delete(received, next_to_send)
next_to_send++
batch, ok = received[next_to_send]
}
} else {
received[batch.order] = batch
}
}
2022-01-14 17:32:12 +01:00
newIter.Done()
2022-01-13 23:27:39 +01:00
}()
2022-01-14 17:32:12 +01:00
return newIter
2022-01-13 23:27:39 +01:00
}
func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSequenceBatch {
if len(iterators) == 0 {
return iterator
}
buffsize := iterator.BufferSize()
2022-01-14 17:32:12 +01:00
newIter := MakeIBioSequenceBatch(buffsize)
2022-01-13 23:27:39 +01:00
2022-01-14 17:32:12 +01:00
newIter.Add(1)
2022-01-13 23:27:39 +01:00
go func() {
2022-01-14 17:32:12 +01:00
newIter.Wait()
close(newIter.Channel())
2022-01-13 23:27:39 +01:00
}()
go func() {
previous_max := 0
max_order := 0
for iterator.Next() {
s := iterator.Get()
if s.order > max_order {
max_order = s.order
}
newIter.Push(s.Reorder(s.order + previous_max))
2022-01-13 23:27:39 +01:00
}
previous_max = max_order + 1
for _, iter := range iterators {
for iter.Next() {
s := iter.Get()
if (s.order + previous_max) > max_order {
max_order = s.order + previous_max
}
newIter.Push(s.Reorder(s.order + previous_max))
2022-01-13 23:27:39 +01:00
}
previous_max = max_order + 1
}
2022-01-14 17:32:12 +01:00
newIter.Done()
2022-01-13 23:27:39 +01:00
}()
2022-01-14 17:32:12 +01:00
return newIter
2022-01-13 23:27:39 +01:00
}
// Redistributes sequences from a IBioSequenceBatch into a new
// IBioSequenceBatch with every batches having the same size
// indicated in parameter. Rebatching implies to sort the
// source IBioSequenceBatch.
func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBatch {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
2022-01-14 17:32:12 +01:00
newIter := MakeIBioSequenceBatch(buffsize)
2022-01-13 23:27:39 +01:00
2022-01-14 17:32:12 +01:00
newIter.Add(1)
2022-01-13 23:27:39 +01:00
go func() {
2022-01-14 17:32:12 +01:00
newIter.Wait()
close(newIter.pointer.channel)
2022-01-13 23:27:39 +01:00
}()
go func() {
order := 0
iterator = iterator.SortBatches()
buffer := MakeBioSequenceSlice()
2022-01-13 23:27:39 +01:00
for iterator.Next() {
seqs := iterator.Get()
for _, s := range seqs.slice {
buffer = append(buffer, s)
if len(buffer) == size {
newIter.Push(MakeBioSequenceBatch(order, buffer))
2022-01-13 23:27:39 +01:00
order++
buffer = MakeBioSequenceSlice()
2022-01-13 23:27:39 +01:00
}
}
2022-02-18 22:53:09 +01:00
seqs.Recycle()
2022-01-13 23:27:39 +01:00
}
if len(buffer) > 0 {
newIter.Push(MakeBioSequenceBatch(order, buffer))
2022-01-13 23:27:39 +01:00
}
2022-01-14 17:32:12 +01:00
newIter.Done()
2022-01-13 23:27:39 +01:00
}()
2022-01-14 17:32:12 +01:00
return newIter
2022-01-13 23:27:39 +01:00
}
func (iterator IBioSequenceBatch) Recycle() {
2022-01-13 23:27:39 +01:00
log.Println("Start recycling of Bioseq objects")
recycled := 0
2022-01-13 23:27:39 +01:00
for iterator.Next() {
2022-02-18 09:59:47 +01:00
// iterator.Get()
2022-01-13 23:27:39 +01:00
batch := iterator.Get()
for _, seq := range batch.Slice() {
seq.Recycle()
recycled++
2022-01-13 23:27:39 +01:00
}
batch.Recycle()
2022-01-13 23:27:39 +01:00
}
log.Printf("End of the recycling of %d Bioseq objects", recycled)
2022-01-13 23:27:39 +01:00
}
func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...int) IPairedBioSequenceBatch {
buffsize := iterator.BufferSize()
batchsize := 5000
if len(sizes) > 0 {
batchsize = sizes[0]
}
if len(sizes) > 1 {
buffsize = sizes[1]
}
iterator = iterator.Rebatch(batchsize)
reverse = reverse.Rebatch(batchsize)
2022-01-14 17:32:12 +01:00
newIter := MakeIPairedBioSequenceBatch(buffsize)
2022-01-13 23:27:39 +01:00
2022-01-14 17:32:12 +01:00
newIter.Add(1)
2022-01-13 23:27:39 +01:00
go func() {
2022-01-14 17:32:12 +01:00
newIter.Wait()
close(newIter.pointer.channel)
2022-01-13 23:27:39 +01:00
log.Println("End of association of paired reads")
}()
log.Println("Start association of paired reads")
go func() {
for iterator.Next() {
if !reverse.Next() {
log.Panicln("Etrange reverse pas prêt")
}
2022-01-14 17:32:12 +01:00
newIter.Channel() <- MakePairedBioSequenceBatch(iterator.Get(),
2022-01-13 23:27:39 +01:00
reverse.Get())
}
2022-01-14 17:32:12 +01:00
newIter.Done()
2022-01-13 23:27:39 +01:00
}()
2022-01-14 17:32:12 +01:00
return newIter
2022-01-13 23:27:39 +01:00
}
2022-02-01 23:25:19 +01:00
func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
2022-02-01 23:25:19 +01:00
}
trueIter := MakeIBioSequenceBatch(buffsize)
falseIter := MakeIBioSequenceBatch(buffsize)
trueIter.Add(1)
falseIter.Add(1)
go func() {
trueIter.WaitAndClose()
falseIter.WaitAndClose()
2022-02-01 23:25:19 +01:00
}()
go func() {
trueOrder := 0
falseOrder := 0
iterator = iterator.SortBatches()
trueSlice := MakeBioSequenceSlice()
falseSlice := MakeBioSequenceSlice()
2022-02-01 23:25:19 +01:00
for iterator.Next() {
seqs := iterator.Get()
for _, s := range seqs.slice {
if predicate(s) {
trueSlice = append(trueSlice, s)
} else {
falseSlice = append(falseSlice, s)
}
if len(trueSlice) == size {
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice))
2022-02-01 23:25:19 +01:00
trueOrder++
trueSlice = MakeBioSequenceSlice()
2022-02-01 23:25:19 +01:00
}
if len(falseSlice) == size {
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice))
2022-02-01 23:25:19 +01:00
falseOrder++
falseSlice = MakeBioSequenceSlice()
2022-02-01 23:25:19 +01:00
}
}
2022-02-18 22:53:09 +01:00
seqs.Recycle()
2022-02-01 23:25:19 +01:00
}
if len(trueSlice) > 0 {
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice))
2022-02-01 23:25:19 +01:00
}
if len(falseSlice) > 0 {
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice))
2022-02-01 23:25:19 +01:00
}
trueIter.Done()
falseIter.Done()
}()
return trueIter, falseIter
}