mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
change the model for representing paired reads and extend its usage to other commands
This commit is contained in:
@ -46,6 +46,7 @@ type _IBioSequence struct {
|
||||
batch_size int32
|
||||
sequence_format string
|
||||
finished *abool.AtomicBool
|
||||
paired bool
|
||||
}
|
||||
|
||||
type IBioSequence struct {
|
||||
@ -73,6 +74,7 @@ func MakeIBioSequence(sizes ...int) IBioSequence {
|
||||
batch_size: -1,
|
||||
sequence_format: "",
|
||||
finished: abool.New(),
|
||||
paired: false,
|
||||
}
|
||||
|
||||
waiting := sync.WaitGroup{}
|
||||
@ -199,6 +201,11 @@ func (iterator IBioSequence) Split() IBioSequence {
|
||||
i.lock = &lock
|
||||
|
||||
newIter := IBioSequence{&i}
|
||||
|
||||
if iterator.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
@ -270,6 +277,7 @@ func (iterator IBioSequence) Finished() bool {
|
||||
return iterator.pointer.finished.IsSet()
|
||||
}
|
||||
|
||||
// Sorting the batches of sequences.
|
||||
func (iterator IBioSequence) SortBatches(sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
@ -311,6 +319,10 @@ func (iterator IBioSequence) SortBatches(sizes ...int) IBioSequence {
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
if iterator.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
|
||||
}
|
||||
@ -321,6 +333,11 @@ func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
|
||||
return iterator
|
||||
}
|
||||
|
||||
allPaired := iterator.IsPaired()
|
||||
for _, i := range iterators {
|
||||
allPaired = allPaired && i.IsPaired()
|
||||
}
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
|
||||
@ -357,6 +374,10 @@ func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
if allPaired {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
@ -368,6 +389,12 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence {
|
||||
return iterator
|
||||
}
|
||||
|
||||
allPaired := iterator.IsPaired()
|
||||
|
||||
for _, i := range iterators {
|
||||
allPaired = allPaired && i.IsPaired()
|
||||
}
|
||||
|
||||
nextCounter := goutils.AtomicCounter()
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
@ -392,6 +419,10 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence {
|
||||
go ff(i)
|
||||
}
|
||||
|
||||
if allPaired {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
@ -441,6 +472,10 @@ func (iterator IBioSequence) Rebatch(size int, sizes ...int) IBioSequence {
|
||||
|
||||
}()
|
||||
|
||||
if iterator.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
@ -492,47 +527,6 @@ func (iterator IBioSequence) Count(recycle bool) (int, int, int) {
|
||||
return variants, reads, nucleotides
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) PairWith(reverse IBioSequence,
|
||||
sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
batchsize := 5000
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
iterator = iterator.Rebatch(batchsize)
|
||||
reverse = reverse.Rebatch(batchsize)
|
||||
|
||||
newIter := MakeIPairedBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
log.Println("End of association of paired reads")
|
||||
}()
|
||||
|
||||
log.Println("Start association of paired reads")
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
if !reverse.Next() {
|
||||
log.Panicln("Etrange reverse pas prêt")
|
||||
}
|
||||
newIter.Channel() <- MakePairedBioSequenceBatch(iterator.Get(),
|
||||
reverse.Get())
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
// A function that takes a predicate and returns two IBioSequenceBatch iterators.
|
||||
// Sequences extracted from the input iterator are distributed among both the
|
||||
// iterator following the predicate value.
|
||||
@ -599,6 +593,10 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate,
|
||||
falseIter.Done()
|
||||
}()
|
||||
|
||||
if iterator.IsPaired() {
|
||||
trueIter.MarkAsPaired()
|
||||
falseIter.MarkAsPaired()
|
||||
}
|
||||
return trueIter, falseIter
|
||||
}
|
||||
|
||||
@ -654,6 +652,71 @@ func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate,
|
||||
|
||||
go ff(iterator)
|
||||
|
||||
if iterator.IsPaired() {
|
||||
trueIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return trueIter.Rebatch(size)
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
|
||||
size int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
nworkers := 4
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
trueIter := MakeIBioSequence(buffsize)
|
||||
|
||||
trueIter.Add(nworkers)
|
||||
|
||||
go func() {
|
||||
trueIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
ff := func(iterator IBioSequence) {
|
||||
// iterator = iterator.SortBatches()
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
slice := seqs.slice
|
||||
j := 0
|
||||
for _, s := range slice {
|
||||
good := predicate(s)
|
||||
if s.IsPaired() {
|
||||
good = good && predicate(s.PairedWith())
|
||||
}
|
||||
if good {
|
||||
slice[j] = s
|
||||
j++
|
||||
} else {
|
||||
s.Recycle()
|
||||
}
|
||||
}
|
||||
|
||||
seqs.slice = slice[:j]
|
||||
trueIter.pointer.channel <- seqs
|
||||
}
|
||||
|
||||
trueIter.Done()
|
||||
}
|
||||
|
||||
for i := 1; i < nworkers; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
go ff(iterator)
|
||||
|
||||
if iterator.IsPaired() {
|
||||
trueIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return trueIter.Rebatch(size)
|
||||
}
|
||||
|
||||
@ -673,13 +736,14 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice {
|
||||
|
||||
// It takes a slice of BioSequence objects, and returns an iterator that will return batches of
|
||||
// BioSequence objects
|
||||
|
||||
func IBatchOver(data obiseq.BioSequenceSlice,
|
||||
size int, sizes ...int) IBioSequence {
|
||||
|
||||
buffsize := 0
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[1]
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
@ -706,5 +770,8 @@ func IBatchOver(data obiseq.BioSequenceSlice,
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
if data.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
return newIter
|
||||
}
|
||||
|
95
pkg/obiiter/paired.go
Normal file
95
pkg/obiiter/paired.go
Normal file
@ -0,0 +1,95 @@
|
||||
package obiiter
|
||||
|
||||
import (
|
||||
"log"
|
||||
)
|
||||
|
||||
func (b BioSequenceBatch) IsPaired() bool {
|
||||
return b.slice.IsPaired()
|
||||
}
|
||||
|
||||
func (b BioSequenceBatch) PairedWith() BioSequenceBatch {
|
||||
return MakeBioSequenceBatch(b.order,
|
||||
*b.slice.PairedWith())
|
||||
|
||||
}
|
||||
|
||||
func (b *BioSequenceBatch) PairTo(p *BioSequenceBatch) {
|
||||
|
||||
if b.order != p.order {
|
||||
log.Fatalf("both batches are not synchronized : (%d,%d)",
|
||||
b.order, p.order,
|
||||
)
|
||||
}
|
||||
|
||||
b.slice.PairTo(&p.slice)
|
||||
|
||||
}
|
||||
|
||||
func (b *BioSequenceBatch) UnPair() {
|
||||
b.slice.UnPair()
|
||||
}
|
||||
|
||||
func (iter IBioSequence) MarkAsPaired() {
|
||||
iter.pointer.paired = true
|
||||
}
|
||||
|
||||
func (iter IBioSequence) PairTo(p IBioSequence) IBioSequence {
|
||||
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
iter = iter.SortBatches()
|
||||
p = p.SortBatches()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
|
||||
for iter.Next() {
|
||||
p.Next()
|
||||
batch := iter.Get()
|
||||
pbatch := p.Get()
|
||||
batch.PairTo(&pbatch)
|
||||
newIter.Push(batch)
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
newIter.MarkAsPaired()
|
||||
return newIter
|
||||
|
||||
}
|
||||
|
||||
func (iter IBioSequence) PairedWith() IBioSequence {
|
||||
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
|
||||
for iter.Next() {
|
||||
batch := iter.Get().PairedWith()
|
||||
newIter.Push(batch)
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
newIter.MarkAsPaired()
|
||||
return newIter
|
||||
|
||||
}
|
||||
|
||||
func (iter IBioSequence) IsPaired() bool {
|
||||
return iter.pointer.paired
|
||||
}
|
@ -1,239 +0,0 @@
|
||||
package obiiter
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
type PairedBioSequenceBatch struct {
|
||||
forward obiseq.BioSequenceSlice
|
||||
reverse obiseq.BioSequenceSlice
|
||||
order int
|
||||
}
|
||||
|
||||
var NilPairedBioSequenceBatch = PairedBioSequenceBatch{nil, nil, -1}
|
||||
|
||||
func MakePairedBioSequenceBatch(forward, reverse BioSequenceBatch) PairedBioSequenceBatch {
|
||||
if forward.order != reverse.order {
|
||||
log.Fatalf("Forward order : %d and reverse order : %d are not matching",
|
||||
forward.order, reverse.order)
|
||||
}
|
||||
|
||||
for i := range reverse.slice {
|
||||
reverse.slice[i].ReverseComplement(true)
|
||||
}
|
||||
|
||||
return PairedBioSequenceBatch{
|
||||
forward: forward.slice,
|
||||
reverse: reverse.slice,
|
||||
order: forward.order,
|
||||
}
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Order() int {
|
||||
return batch.order
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Reorder(newOrder int) PairedBioSequenceBatch {
|
||||
batch.order = newOrder
|
||||
return batch
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Len() int {
|
||||
return len(batch.forward)
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Forward() obiseq.BioSequenceSlice {
|
||||
return batch.forward
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Reverse() obiseq.BioSequenceSlice {
|
||||
return batch.reverse
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) IsNil() bool {
|
||||
return batch.forward == nil
|
||||
}
|
||||
|
||||
// Structure implementing an iterator over bioseq.BioSequenceBatch
|
||||
// based on a channel.
|
||||
type __ipairedbiosequencebatch__ struct {
|
||||
channel chan PairedBioSequenceBatch
|
||||
current PairedBioSequenceBatch
|
||||
pushBack bool
|
||||
all_done *sync.WaitGroup
|
||||
buffer_size int
|
||||
finished bool
|
||||
p_finished *bool
|
||||
}
|
||||
|
||||
type IPairedBioSequenceBatch struct {
|
||||
pointer *__ipairedbiosequencebatch__
|
||||
}
|
||||
|
||||
var NilIPairedBioSequenceBatch = IPairedBioSequenceBatch{pointer: nil}
|
||||
|
||||
func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := 1
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
i := __ipairedbiosequencebatch__{
|
||||
channel: make(chan PairedBioSequenceBatch, buffsize),
|
||||
current: NilPairedBioSequenceBatch,
|
||||
pushBack: false,
|
||||
buffer_size: buffsize,
|
||||
finished: false,
|
||||
p_finished: nil,
|
||||
}
|
||||
|
||||
i.p_finished = &i.finished
|
||||
waiting := sync.WaitGroup{}
|
||||
i.all_done = &waiting
|
||||
ii := IPairedBioSequenceBatch{&i}
|
||||
|
||||
RegisterAPipe()
|
||||
return ii
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Add(n int) {
|
||||
iterator.pointer.all_done.Add(n)
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Done() {
|
||||
iterator.pointer.all_done.Done()
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Wait() {
|
||||
iterator.pointer.all_done.Wait()
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Channel() chan PairedBioSequenceBatch {
|
||||
return iterator.pointer.channel
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Close() {
|
||||
close(iterator.pointer.channel)
|
||||
UnregisterPipe()
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) WaitAndClose() {
|
||||
iterator.Wait()
|
||||
|
||||
for len(iterator.Channel()) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
|
||||
iterator.Close()
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) IsNil() bool {
|
||||
return iterator.pointer == nil
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) BufferSize() int {
|
||||
return iterator.pointer.buffer_size
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch {
|
||||
i := __ipairedbiosequencebatch__{
|
||||
channel: iterator.pointer.channel,
|
||||
current: NilPairedBioSequenceBatch,
|
||||
pushBack: false,
|
||||
all_done: iterator.pointer.all_done,
|
||||
buffer_size: iterator.pointer.buffer_size,
|
||||
finished: false,
|
||||
p_finished: iterator.pointer.p_finished}
|
||||
newIter := IPairedBioSequenceBatch{&i}
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Next() bool {
|
||||
if *(iterator.pointer.p_finished) {
|
||||
return false
|
||||
}
|
||||
|
||||
if iterator.pointer.pushBack {
|
||||
iterator.pointer.pushBack = false
|
||||
return true
|
||||
}
|
||||
|
||||
next, ok := (<-iterator.pointer.channel)
|
||||
|
||||
if ok {
|
||||
iterator.pointer.current = next
|
||||
return true
|
||||
}
|
||||
|
||||
iterator.pointer.current = NilPairedBioSequenceBatch
|
||||
*iterator.pointer.p_finished = true
|
||||
return false
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) PushBack() {
|
||||
if !iterator.pointer.current.IsNil() {
|
||||
iterator.pointer.pushBack = true
|
||||
}
|
||||
}
|
||||
|
||||
// The 'Get' method returns the instance of BioSequenceBatch
|
||||
// currently pointed by the iterator. You have to use the
|
||||
// 'Next' method to move to the next entry before calling
|
||||
// 'Get' to retreive the following instance.
|
||||
func (iterator IPairedBioSequenceBatch) Get() PairedBioSequenceBatch {
|
||||
return iterator.pointer.current
|
||||
}
|
||||
|
||||
// Finished returns 'true' value if no more data is available
|
||||
// from the iterator.
|
||||
func (iterator IPairedBioSequenceBatch) Finished() bool {
|
||||
return *iterator.pointer.p_finished
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) SortBatches(sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIPairedBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]PairedBioSequenceBatch)
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
if batch.order == next_to_send {
|
||||
newIter.pointer.channel <- batch
|
||||
next_to_send++
|
||||
batch, ok := received[next_to_send]
|
||||
for ok {
|
||||
newIter.pointer.channel <- batch
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
batch, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[batch.order] = batch
|
||||
}
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
|
||||
}
|
@ -40,5 +40,10 @@ func (input IBioSequence) CopyTee() (IBioSequence, IBioSequence) {
|
||||
}
|
||||
}()
|
||||
|
||||
if input.IsPaired() {
|
||||
first.MarkAsPaired()
|
||||
second.MarkAsPaired()
|
||||
}
|
||||
|
||||
return first, second
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ package obiiter
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
@ -45,18 +46,29 @@ func (iterator IBioSequence) Speed(message ...string) IBioSequence {
|
||||
bar := progressbar.NewOptions(-1, pbopt...)
|
||||
|
||||
go func() {
|
||||
c := 0
|
||||
start := time.Now()
|
||||
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
l := batch.Len()
|
||||
c += batch.Len()
|
||||
newIter.Push(batch)
|
||||
bar.Add(l)
|
||||
elapsed := time.Since(start)
|
||||
if elapsed > (time.Millisecond * 100) {
|
||||
bar.Add(c)
|
||||
c = 0
|
||||
start = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Fprintln(os.Stderr)
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
if iterator.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
|
@ -54,6 +54,10 @@ func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int)
|
||||
}
|
||||
go f(iterator)
|
||||
|
||||
if iterator.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
@ -99,6 +103,10 @@ func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePre
|
||||
}
|
||||
go f(iterator)
|
||||
|
||||
if iterator.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
@ -138,6 +146,10 @@ func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, size
|
||||
}
|
||||
go f(iterator)
|
||||
|
||||
if iterator.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user