mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 08:40:26 +00:00
Some code refactoring, a new and more memory-efficient version of obiuniq, and a first Makefile for building obitools
This commit is contained in:
@@ -1,115 +0,0 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type IDistribute struct {
|
||||
outputs map[int]IBioSequenceBatch
|
||||
news chan int
|
||||
classifier *BioSequenceClassifier
|
||||
lock *sync.Mutex
|
||||
}
|
||||
|
||||
func (dist *IDistribute) Outputs(key int) (IBioSequenceBatch, error) {
|
||||
dist.lock.Lock()
|
||||
iter, ok := dist.outputs[key]
|
||||
dist.lock.Unlock()
|
||||
|
||||
if !ok {
|
||||
return NilIBioSequenceBatch, fmt.Errorf("code %d unknown", key)
|
||||
}
|
||||
|
||||
return iter, nil
|
||||
}
|
||||
|
||||
func (dist *IDistribute) News() chan int {
|
||||
return dist.news
|
||||
}
|
||||
|
||||
func (dist *IDistribute) Classifier() *BioSequenceClassifier {
|
||||
return dist.classifier
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Distribute(class *BioSequenceClassifier, sizes ...int) IDistribute {
|
||||
batchsize := 5000
|
||||
buffsize := 2
|
||||
|
||||
outputs := make(map[int]IBioSequenceBatch, 100)
|
||||
slices := make(map[int]*BioSequenceSlice, 100)
|
||||
orders := make(map[int]int, 100)
|
||||
news := make(chan int)
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
jobDone := sync.WaitGroup{}
|
||||
lock := sync.Mutex{}
|
||||
|
||||
jobDone.Add(1)
|
||||
|
||||
go func() {
|
||||
jobDone.Wait()
|
||||
close(news)
|
||||
for _, i := range outputs {
|
||||
i.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
go func() {
|
||||
iterator = iterator.SortBatches()
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
for _, s := range seqs.Slice() {
|
||||
key := class.Code(s)
|
||||
slice, ok := slices[key]
|
||||
|
||||
if !ok {
|
||||
s := MakeBioSequenceSlice()
|
||||
slice = &s
|
||||
slices[key] = slice
|
||||
orders[key] = 0
|
||||
|
||||
lock.Lock()
|
||||
outputs[key] = MakeIBioSequenceBatch(buffsize)
|
||||
lock.Unlock()
|
||||
|
||||
news <- key
|
||||
}
|
||||
|
||||
*slice = append(*slice, s)
|
||||
|
||||
if len(*slice) == batchsize {
|
||||
outputs[key].Push(MakeBioSequenceBatch(orders[key], *slice))
|
||||
orders[key]++
|
||||
s := MakeBioSequenceSlice()
|
||||
slices[key] = &s
|
||||
}
|
||||
}
|
||||
seqs.Recycle()
|
||||
}
|
||||
|
||||
for key, slice := range slices {
|
||||
if len(*slice) > 0 {
|
||||
outputs[key].Push(MakeBioSequenceBatch(orders[key], *slice))
|
||||
}
|
||||
}
|
||||
|
||||
jobDone.Done()
|
||||
|
||||
}()
|
||||
|
||||
return IDistribute{
|
||||
outputs,
|
||||
news,
|
||||
class,
|
||||
&lock}
|
||||
|
||||
}
|
||||
@@ -1,340 +0,0 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Private structure implementing an iterator over
|
||||
// bioseq.BioSequence based on a channel.
|
||||
type __ibiosequence__ struct {
|
||||
channel chan *BioSequence
|
||||
current *BioSequence
|
||||
pushBack bool
|
||||
all_done *sync.WaitGroup
|
||||
buffer_size int
|
||||
finished bool
|
||||
pFinished *bool
|
||||
}
|
||||
|
||||
type IBioSequence struct {
|
||||
pointer *__ibiosequence__
|
||||
}
|
||||
|
||||
// NilIBioSequence is the iterator handle that refers to no iterator state.
var NilIBioSequence = IBioSequence{}
|
||||
|
||||
func (iterator IBioSequence) IsNil() bool {
|
||||
return iterator.pointer == nil
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Add(n int) {
|
||||
iterator.pointer.all_done.Add(n)
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Done() {
|
||||
iterator.pointer.all_done.Done()
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Wait() {
|
||||
iterator.pointer.all_done.Wait()
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Channel() chan *BioSequence {
|
||||
return iterator.pointer.channel
|
||||
}
|
||||
func (iterator IBioSequence) PChannel() *chan *BioSequence {
|
||||
return &(iterator.pointer.channel)
|
||||
}
|
||||
|
||||
func MakeIBioSequence(sizes ...int) IBioSequence {
|
||||
buffsize := 1
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
i := __ibiosequence__{
|
||||
channel: make(chan *BioSequence, buffsize),
|
||||
current: nil,
|
||||
pushBack: false,
|
||||
buffer_size: buffsize,
|
||||
finished: false,
|
||||
pFinished: nil,
|
||||
}
|
||||
|
||||
i.pFinished = &i.finished
|
||||
waiting := sync.WaitGroup{}
|
||||
i.all_done = &waiting
|
||||
ii := IBioSequence{&i}
|
||||
return ii
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Split() IBioSequence {
|
||||
|
||||
i := __ibiosequence__{
|
||||
channel: iterator.pointer.channel,
|
||||
current: nil,
|
||||
pushBack: false,
|
||||
finished: false,
|
||||
all_done: iterator.pointer.all_done,
|
||||
buffer_size: iterator.pointer.buffer_size,
|
||||
pFinished: iterator.pointer.pFinished,
|
||||
}
|
||||
|
||||
newIter := IBioSequence{&i}
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Next() bool {
|
||||
if iterator.IsNil() || *(iterator.pointer.pFinished) {
|
||||
iterator.pointer.current = nil
|
||||
return false
|
||||
}
|
||||
|
||||
if iterator.pointer.pushBack {
|
||||
iterator.pointer.pushBack = false
|
||||
return true
|
||||
}
|
||||
|
||||
next, ok := (<-iterator.pointer.channel)
|
||||
|
||||
if ok {
|
||||
iterator.pointer.current = next
|
||||
return true
|
||||
}
|
||||
|
||||
iterator.pointer.current = nil
|
||||
*iterator.pointer.pFinished = true
|
||||
return false
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) PushBack() {
|
||||
if !(iterator.pointer.current == nil) {
|
||||
iterator.pointer.pushBack = true
|
||||
}
|
||||
}
|
||||
|
||||
// The 'Get' method returns the instance of BioSequence
|
||||
// currently pointed by the iterator. You have to use the
|
||||
// 'Next' method to move to the next entry before calling
|
||||
// 'Get' to retreive the following instance.
|
||||
func (iterator IBioSequence) Get() *BioSequence {
|
||||
return iterator.pointer.current
|
||||
}
|
||||
|
||||
// Finished returns 'true' value if no more data is available
|
||||
// from the iterator.
|
||||
func (iterator IBioSequence) Finished() bool {
|
||||
return *iterator.pointer.pFinished
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) BufferSize() int {
|
||||
return iterator.pointer.buffer_size
|
||||
}
|
||||
|
||||
// The IBioSequenceBatch converts a IBioSequence iterator
|
||||
// into an iterator oveer batches oof sequences. By default
|
||||
// the size of a batch is of 100 sequences and the iterator
|
||||
// implements a buffer equal to that of the source iterator.
|
||||
// These defaults can be overriden by specifying one or two
|
||||
// optional parametters at the method call. The first one
|
||||
// indicates the batch size. The second optional parametter
|
||||
// indicates the size of the buffer.
|
||||
func (iterator IBioSequence) IBioSequenceBatch(sizes ...int) IBioSequenceBatch {
|
||||
batchsize := 100
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for j := 0; !iterator.Finished(); j++ {
|
||||
batch := BioSequenceBatch{
|
||||
slice: MakeBioSequenceSlice(),
|
||||
order: j}
|
||||
for i := 0; i < batchsize && iterator.Next(); i++ {
|
||||
seq := iterator.Get()
|
||||
batch.slice = append(batch.slice, seq)
|
||||
}
|
||||
newIter.pointer.channel <- batch
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) IBioSequence(sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
s := iterator.Get()
|
||||
newIter.pointer.channel <- s
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Skip(n int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for i := 0; iterator.Next(); i++ {
|
||||
if i >= n {
|
||||
s := iterator.Get()
|
||||
newIter.pointer.channel <- s
|
||||
}
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Head(n int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
not_done := true
|
||||
for i := 0; iterator.Next(); i++ {
|
||||
if i < n {
|
||||
s := iterator.Get()
|
||||
newIter.pointer.channel <- s
|
||||
} else {
|
||||
if not_done {
|
||||
newIter.Done()
|
||||
not_done = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
// The 'Tail' method discard every data from the source iterator
|
||||
// except the 'n' last ones.
|
||||
func (iterator IBioSequence) Tail(n int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
buffseq := MakeBioSequenceSlice()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
var i int
|
||||
for i = 0; iterator.Next(); i++ {
|
||||
buffseq[i%n] = iterator.Get()
|
||||
}
|
||||
if i > n {
|
||||
for j := 0; j < n; j++ {
|
||||
newIter.Channel() <- buffseq[(i+j)%n]
|
||||
}
|
||||
|
||||
} else {
|
||||
for j := 0; j < i; j++ {
|
||||
newIter.Channel() <- buffseq[j]
|
||||
}
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
|
||||
|
||||
if len(iterators) == 0 {
|
||||
return iterator
|
||||
}
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
s := iterator.Get()
|
||||
newIter.pointer.channel <- s
|
||||
}
|
||||
|
||||
for _, iter := range iterators {
|
||||
for iter.Next() {
|
||||
s := iter.Get()
|
||||
newIter.pointer.channel <- s
|
||||
}
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
@@ -165,41 +165,3 @@ func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequenc
|
||||
return seq
|
||||
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) IMergeSequenceBatch(na string, statsOn []string, sizes ...int) IBioSequenceBatch {
|
||||
batchsize := 100
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for j := 0; !iterator.Finished(); j++ {
|
||||
batch := BioSequenceBatch{
|
||||
slice: MakeBioSequenceSlice(),
|
||||
order: j}
|
||||
for i := 0; i < batchsize && iterator.Next(); i++ {
|
||||
seqs := iterator.Get()
|
||||
batch.slice = append(batch.slice, seqs.slice.Merge(na, statsOn))
|
||||
}
|
||||
if batch.Length() > 0 {
|
||||
newIter.Push(batch)
|
||||
}
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
|
||||
"github.com/PaesslerAG/gval"
|
||||
)
|
||||
|
||||
// SequencePredicate is a function testing a BioSequence and returning the
// boolean result of that test.
type SequencePredicate func(*BioSequence) bool
|
||||
|
||||
func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePredicate {
|
||||
@@ -73,3 +80,33 @@ func IsShorterOrEqualTo(length int) SequencePredicate {
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func ExrpessionPredicat(expression string) SequencePredicate {
|
||||
|
||||
exp, err := gval.Full().NewEvaluable(expression)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Error in the expression : %s", expression)
|
||||
}
|
||||
|
||||
f := func(sequence *BioSequence) bool {
|
||||
value, err := exp.EvalBool(context.Background(),
|
||||
map[string]interface{}{
|
||||
"annot": sequence.Annotations(),
|
||||
"count": sequence.Count(),
|
||||
"length": sequence.Length(),
|
||||
"sequence": sequence,
|
||||
},
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Expression '%s' cannot be evaluated on sequence %s",
|
||||
expression,
|
||||
sequence.Id())
|
||||
}
|
||||
|
||||
return value
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
|
||||
func (iterator IBioSequenceBatch) Speed() IBioSequenceBatch {
|
||||
newIter := MakeIBioSequenceBatch()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
bar := progressbar.NewOptions(
|
||||
-1,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("[Sequence Processing]"))
|
||||
|
||||
go func() {
|
||||
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
l := batch.Length()
|
||||
newIter.Push(batch)
|
||||
bar.Add(l)
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
2
pkg/obiseq/worker.go
Normal file
2
pkg/obiseq/worker.go
Normal file
@@ -0,0 +1,2 @@
|
||||
package obiseq
|
||||
|
||||
@@ -1,127 +0,0 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"log"
|
||||
)
|
||||
|
||||
type SeqAnnotator func(*BioSequence)
|
||||
|
||||
type SeqWorker func(*BioSequence) *BioSequence
|
||||
type SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice
|
||||
|
||||
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
|
||||
f := func(seq *BioSequence) *BioSequence {
|
||||
function(seq)
|
||||
return seq
|
||||
}
|
||||
return f
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
seq := iterator.Get()
|
||||
seq = worker(seq)
|
||||
newIter.pointer.channel <- seq
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequenceBatch {
|
||||
nworkers := 4
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(nworkers)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
log.Println("End of the batch workers")
|
||||
|
||||
}()
|
||||
|
||||
f := func(iterator IBioSequenceBatch) {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
for i, seq := range batch.slice {
|
||||
batch.slice[i] = worker(seq)
|
||||
}
|
||||
newIter.Push(batch)
|
||||
}
|
||||
newIter.Done()
|
||||
}
|
||||
|
||||
log.Println("Start of the batch workers")
|
||||
for i := 0; i < nworkers-1; i++ {
|
||||
go f(iterator.Split())
|
||||
}
|
||||
go f(iterator)
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) MakeISliceWorker(worker SeqSliceWorker, sizes ...int) IBioSequenceBatch {
|
||||
nworkers := 4
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(nworkers)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
log.Println("End of the batch slice workers")
|
||||
}()
|
||||
|
||||
f := func(iterator IBioSequenceBatch) {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
batch.slice = worker(batch.slice)
|
||||
newIter.pointer.channel <- batch
|
||||
}
|
||||
newIter.Done()
|
||||
}
|
||||
|
||||
log.Println("Start of the batch slice workers")
|
||||
for i := 0; i < nworkers-1; i++ {
|
||||
go f(iterator.Split())
|
||||
}
|
||||
go f(iterator)
|
||||
|
||||
return newIter
|
||||
}
|
||||
Reference in New Issue
Block a user