change the model for representing paired reads and extend its usage to other commands

This commit is contained in:
2023-02-23 23:35:58 +01:00
parent ebb05fcdf7
commit 072b85e155
23 changed files with 598 additions and 338 deletions

View File

@ -133,5 +133,5 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
opts.BufferSize(), opts.BufferSize(),
) )
return iMerged.Speed("Variants identified"), nil return iMerged, nil
} }

View File

@ -156,7 +156,6 @@ func WriteFastaToFile(iterator obiiter.IBioSequence,
filename string, filename string,
options ...WithOption) (obiiter.IBioSequence, error) { options ...WithOption) (obiiter.IBioSequence, error) {
opt := MakeOptions(options) opt := MakeOptions(options)
flags := os.O_WRONLY | os.O_CREATE flags := os.O_WRONLY | os.O_CREATE
@ -165,7 +164,6 @@ func WriteFastaToFile(iterator obiiter.IBioSequence,
} }
file, err := os.OpenFile(filename, flags, 0660) file, err := os.OpenFile(filename, flags, 0660)
if err != nil { if err != nil {
log.Fatalf("open file error: %v", err) log.Fatalf("open file error: %v", err)
return obiiter.NilIBioSequence, err return obiiter.NilIBioSequence, err
@ -173,5 +171,18 @@ func WriteFastaToFile(iterator obiiter.IBioSequence,
options = append(options, OptionCloseFile()) options = append(options, OptionCloseFile())
return WriteFasta(iterator, file, options...) iterator, err = WriteFasta(iterator, file, options...)
if opt.HaveToSavePaired() {
var revfile *os.File
revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
if err != nil {
log.Fatalf("open file error: %v", err)
return obiiter.NilIBioSequence, err
}
iterator, err = WriteFasta(iterator.PairedWith(), revfile, options...)
}
return iterator, err
} }

View File

@ -160,5 +160,18 @@ func WriteFastqToFile(iterator obiiter.IBioSequence,
options = append(options, OptionCloseFile()) options = append(options, OptionCloseFile())
return WriteFastq(iterator, file, options...) iterator, err = WriteFastq(iterator, file, options...)
if opt.HaveToSavePaired() {
var revfile *os.File
revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
if err != nil {
log.Fatalf("open file error: %v", err)
return obiiter.NilIBioSequence, err
}
iterator, err = WriteFastq(iterator.PairedWith(), revfile, options...)
}
return iterator, err
} }

View File

@ -15,6 +15,11 @@ type __options__ struct {
closefile bool closefile bool
appendfile bool appendfile bool
compressed bool compressed bool
csv_ids bool
cvs_sequence bool
csv_definition bool
csv_separator string
paired_filename string
} }
type Options struct { type Options struct {
@ -35,6 +40,11 @@ func MakeOptions(setters []WithOption) Options {
closefile: false, closefile: false,
appendfile: false, appendfile: false,
compressed: false, compressed: false,
csv_ids: true,
csv_definition: false,
cvs_sequence: true,
csv_separator: ",",
paired_filename: "",
} }
opt := Options{&o} opt := Options{&o}
@ -86,6 +96,18 @@ func (opt Options) CompressedFile() bool {
return opt.pointer.compressed return opt.pointer.compressed
} }
func (opt Options) CSVIds() bool {
return opt.pointer.csv_ids
}
func (opt Options) HaveToSavePaired() bool {
return opt.pointer.paired_filename != ""
}
func (opt Options) PairedFileName() string {
return opt.pointer.paired_filename
}
func OptionsBufferSize(size int) WithOption { func OptionsBufferSize(size int) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
opt.pointer.buffer_size = size opt.pointer.buffer_size = size
@ -216,3 +238,12 @@ func OptionsWithoutProgressBar() WithOption {
return f return f
} }
func WritePairedReadsTo(filename string) WithOption {
f := WithOption(func(opt Options) {
opt.pointer.paired_filename = filename
})
return f
}

View File

@ -69,5 +69,19 @@ func WriteSequencesToFile(iterator obiiter.IBioSequence,
} }
options = append(options, OptionCloseFile()) options = append(options, OptionCloseFile())
return WriteSequence(iterator, file, options...)
iterator, err = WriteSequence(iterator, file, options...)
if opt.HaveToSavePaired() {
var revfile *os.File
revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
if err != nil {
log.Fatalf("open file error: %v", err)
return obiiter.NilIBioSequence, err
}
iterator, err = WriteSequence(iterator.PairedWith(), revfile, options...)
}
return iterator, err
} }

View File

@ -46,6 +46,7 @@ type _IBioSequence struct {
batch_size int32 batch_size int32
sequence_format string sequence_format string
finished *abool.AtomicBool finished *abool.AtomicBool
paired bool
} }
type IBioSequence struct { type IBioSequence struct {
@ -73,6 +74,7 @@ func MakeIBioSequence(sizes ...int) IBioSequence {
batch_size: -1, batch_size: -1,
sequence_format: "", sequence_format: "",
finished: abool.New(), finished: abool.New(),
paired: false,
} }
waiting := sync.WaitGroup{} waiting := sync.WaitGroup{}
@ -199,6 +201,11 @@ func (iterator IBioSequence) Split() IBioSequence {
i.lock = &lock i.lock = &lock
newIter := IBioSequence{&i} newIter := IBioSequence{&i}
if iterator.IsPaired() {
newIter.MarkAsPaired()
}
return newIter return newIter
} }
@ -270,6 +277,7 @@ func (iterator IBioSequence) Finished() bool {
return iterator.pointer.finished.IsSet() return iterator.pointer.finished.IsSet()
} }
// Sorting the batches of sequences.
func (iterator IBioSequence) SortBatches(sizes ...int) IBioSequence { func (iterator IBioSequence) SortBatches(sizes ...int) IBioSequence {
buffsize := iterator.BufferSize() buffsize := iterator.BufferSize()
@ -311,6 +319,10 @@ func (iterator IBioSequence) SortBatches(sizes ...int) IBioSequence {
newIter.Done() newIter.Done()
}() }()
if iterator.IsPaired() {
newIter.MarkAsPaired()
}
return newIter return newIter
} }
@ -321,6 +333,11 @@ func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
return iterator return iterator
} }
allPaired := iterator.IsPaired()
for _, i := range iterators {
allPaired = allPaired && i.IsPaired()
}
buffsize := iterator.BufferSize() buffsize := iterator.BufferSize()
newIter := MakeIBioSequence(buffsize) newIter := MakeIBioSequence(buffsize)
@ -357,6 +374,10 @@ func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
newIter.Done() newIter.Done()
}() }()
if allPaired {
newIter.MarkAsPaired()
}
return newIter return newIter
} }
@ -368,6 +389,12 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence {
return iterator return iterator
} }
allPaired := iterator.IsPaired()
for _, i := range iterators {
allPaired = allPaired && i.IsPaired()
}
nextCounter := goutils.AtomicCounter() nextCounter := goutils.AtomicCounter()
buffsize := iterator.BufferSize() buffsize := iterator.BufferSize()
newIter := MakeIBioSequence(buffsize) newIter := MakeIBioSequence(buffsize)
@ -392,6 +419,10 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence {
go ff(i) go ff(i)
} }
if allPaired {
newIter.MarkAsPaired()
}
return newIter return newIter
} }
@ -441,6 +472,10 @@ func (iterator IBioSequence) Rebatch(size int, sizes ...int) IBioSequence {
}() }()
if iterator.IsPaired() {
newIter.MarkAsPaired()
}
return newIter return newIter
} }
@ -492,47 +527,6 @@ func (iterator IBioSequence) Count(recycle bool) (int, int, int) {
return variants, reads, nucleotides return variants, reads, nucleotides
} }
func (iterator IBioSequence) PairWith(reverse IBioSequence,
sizes ...int) IPairedBioSequenceBatch {
buffsize := iterator.BufferSize()
batchsize := 5000
if len(sizes) > 0 {
batchsize = sizes[0]
}
if len(sizes) > 1 {
buffsize = sizes[1]
}
iterator = iterator.Rebatch(batchsize)
reverse = reverse.Rebatch(batchsize)
newIter := MakeIPairedBioSequenceBatch(buffsize)
newIter.Add(1)
go func() {
newIter.WaitAndClose()
log.Println("End of association of paired reads")
}()
log.Println("Start association of paired reads")
go func() {
for iterator.Next() {
if !reverse.Next() {
log.Panicln("Etrange reverse pas prêt")
}
newIter.Channel() <- MakePairedBioSequenceBatch(iterator.Get(),
reverse.Get())
}
newIter.Done()
}()
return newIter
}
// A function that takes a predicate and returns two IBioSequenceBatch iterators. // A function that takes a predicate and returns two IBioSequenceBatch iterators.
// Sequences extracted from the input iterator are distributed among both the // Sequences extracted from the input iterator are distributed among both the
// iterator following the predicate value. // iterator following the predicate value.
@ -599,6 +593,10 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate,
falseIter.Done() falseIter.Done()
}() }()
if iterator.IsPaired() {
trueIter.MarkAsPaired()
falseIter.MarkAsPaired()
}
return trueIter, falseIter return trueIter, falseIter
} }
@ -654,6 +652,71 @@ func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate,
go ff(iterator) go ff(iterator)
if iterator.IsPaired() {
trueIter.MarkAsPaired()
}
return trueIter.Rebatch(size)
}
func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
size int, sizes ...int) IBioSequence {
buffsize := iterator.BufferSize()
nworkers := 4
if len(sizes) > 0 {
nworkers = sizes[0]
}
if len(sizes) > 1 {
buffsize = sizes[1]
}
trueIter := MakeIBioSequence(buffsize)
trueIter.Add(nworkers)
go func() {
trueIter.WaitAndClose()
}()
ff := func(iterator IBioSequence) {
// iterator = iterator.SortBatches()
for iterator.Next() {
seqs := iterator.Get()
slice := seqs.slice
j := 0
for _, s := range slice {
good := predicate(s)
if s.IsPaired() {
good = good && predicate(s.PairedWith())
}
if good {
slice[j] = s
j++
} else {
s.Recycle()
}
}
seqs.slice = slice[:j]
trueIter.pointer.channel <- seqs
}
trueIter.Done()
}
for i := 1; i < nworkers; i++ {
go ff(iterator.Split())
}
go ff(iterator)
if iterator.IsPaired() {
trueIter.MarkAsPaired()
}
return trueIter.Rebatch(size) return trueIter.Rebatch(size)
} }
@ -673,13 +736,14 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice {
// It takes a slice of BioSequence objects, and returns an iterator that will return batches of // It takes a slice of BioSequence objects, and returns an iterator that will return batches of
// BioSequence objects // BioSequence objects
func IBatchOver(data obiseq.BioSequenceSlice, func IBatchOver(data obiseq.BioSequenceSlice,
size int, sizes ...int) IBioSequence { size int, sizes ...int) IBioSequence {
buffsize := 0 buffsize := 0
if len(sizes) > 0 { if len(sizes) > 0 {
buffsize = sizes[1] buffsize = sizes[0]
} }
newIter := MakeIBioSequence(buffsize) newIter := MakeIBioSequence(buffsize)
@ -706,5 +770,8 @@ func IBatchOver(data obiseq.BioSequenceSlice,
newIter.Done() newIter.Done()
}() }()
if data.IsPaired() {
newIter.MarkAsPaired()
}
return newIter return newIter
} }

95
pkg/obiiter/paired.go Normal file
View File

@ -0,0 +1,95 @@
package obiiter
import (
"log"
)
func (b BioSequenceBatch) IsPaired() bool {
return b.slice.IsPaired()
}
func (b BioSequenceBatch) PairedWith() BioSequenceBatch {
return MakeBioSequenceBatch(b.order,
*b.slice.PairedWith())
}
func (b *BioSequenceBatch) PairTo(p *BioSequenceBatch) {
if b.order != p.order {
log.Fatalf("both batches are not synchronized : (%d,%d)",
b.order, p.order,
)
}
b.slice.PairTo(&p.slice)
}
func (b *BioSequenceBatch) UnPair() {
b.slice.UnPair()
}
func (iter IBioSequence) MarkAsPaired() {
iter.pointer.paired = true
}
func (iter IBioSequence) PairTo(p IBioSequence) IBioSequence {
newIter := MakeIBioSequence()
iter = iter.SortBatches()
p = p.SortBatches()
newIter.Add(1)
go func() {
newIter.WaitAndClose()
}()
go func() {
for iter.Next() {
p.Next()
batch := iter.Get()
pbatch := p.Get()
batch.PairTo(&pbatch)
newIter.Push(batch)
}
newIter.Done()
}()
newIter.MarkAsPaired()
return newIter
}
func (iter IBioSequence) PairedWith() IBioSequence {
newIter := MakeIBioSequence()
newIter.Add(1)
go func() {
newIter.WaitAndClose()
}()
go func() {
for iter.Next() {
batch := iter.Get().PairedWith()
newIter.Push(batch)
}
newIter.Done()
}()
newIter.MarkAsPaired()
return newIter
}
func (iter IBioSequence) IsPaired() bool {
return iter.pointer.paired
}

View File

@ -1,239 +0,0 @@
package obiiter
import (
"sync"
"time"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
type PairedBioSequenceBatch struct {
forward obiseq.BioSequenceSlice
reverse obiseq.BioSequenceSlice
order int
}
var NilPairedBioSequenceBatch = PairedBioSequenceBatch{nil, nil, -1}
func MakePairedBioSequenceBatch(forward, reverse BioSequenceBatch) PairedBioSequenceBatch {
if forward.order != reverse.order {
log.Fatalf("Forward order : %d and reverse order : %d are not matching",
forward.order, reverse.order)
}
for i := range reverse.slice {
reverse.slice[i].ReverseComplement(true)
}
return PairedBioSequenceBatch{
forward: forward.slice,
reverse: reverse.slice,
order: forward.order,
}
}
func (batch PairedBioSequenceBatch) Order() int {
return batch.order
}
func (batch PairedBioSequenceBatch) Reorder(newOrder int) PairedBioSequenceBatch {
batch.order = newOrder
return batch
}
func (batch PairedBioSequenceBatch) Len() int {
return len(batch.forward)
}
func (batch PairedBioSequenceBatch) Forward() obiseq.BioSequenceSlice {
return batch.forward
}
func (batch PairedBioSequenceBatch) Reverse() obiseq.BioSequenceSlice {
return batch.reverse
}
func (batch PairedBioSequenceBatch) IsNil() bool {
return batch.forward == nil
}
// Structure implementing an iterator over bioseq.BioSequenceBatch
// based on a channel.
type __ipairedbiosequencebatch__ struct {
channel chan PairedBioSequenceBatch
current PairedBioSequenceBatch
pushBack bool
all_done *sync.WaitGroup
buffer_size int
finished bool
p_finished *bool
}
type IPairedBioSequenceBatch struct {
pointer *__ipairedbiosequencebatch__
}
var NilIPairedBioSequenceBatch = IPairedBioSequenceBatch{pointer: nil}
func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch {
buffsize := 1
if len(sizes) > 0 {
buffsize = sizes[0]
}
i := __ipairedbiosequencebatch__{
channel: make(chan PairedBioSequenceBatch, buffsize),
current: NilPairedBioSequenceBatch,
pushBack: false,
buffer_size: buffsize,
finished: false,
p_finished: nil,
}
i.p_finished = &i.finished
waiting := sync.WaitGroup{}
i.all_done = &waiting
ii := IPairedBioSequenceBatch{&i}
RegisterAPipe()
return ii
}
func (iterator IPairedBioSequenceBatch) Add(n int) {
iterator.pointer.all_done.Add(n)
}
func (iterator IPairedBioSequenceBatch) Done() {
iterator.pointer.all_done.Done()
}
func (iterator IPairedBioSequenceBatch) Wait() {
iterator.pointer.all_done.Wait()
}
func (iterator IPairedBioSequenceBatch) Channel() chan PairedBioSequenceBatch {
return iterator.pointer.channel
}
func (iterator IPairedBioSequenceBatch) Close() {
close(iterator.pointer.channel)
UnregisterPipe()
}
func (iterator IPairedBioSequenceBatch) WaitAndClose() {
iterator.Wait()
for len(iterator.Channel()) > 0 {
time.Sleep(time.Millisecond)
}
iterator.Close()
}
func (iterator IPairedBioSequenceBatch) IsNil() bool {
return iterator.pointer == nil
}
func (iterator IPairedBioSequenceBatch) BufferSize() int {
return iterator.pointer.buffer_size
}
func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch {
i := __ipairedbiosequencebatch__{
channel: iterator.pointer.channel,
current: NilPairedBioSequenceBatch,
pushBack: false,
all_done: iterator.pointer.all_done,
buffer_size: iterator.pointer.buffer_size,
finished: false,
p_finished: iterator.pointer.p_finished}
newIter := IPairedBioSequenceBatch{&i}
return newIter
}
func (iterator IPairedBioSequenceBatch) Next() bool {
if *(iterator.pointer.p_finished) {
return false
}
if iterator.pointer.pushBack {
iterator.pointer.pushBack = false
return true
}
next, ok := (<-iterator.pointer.channel)
if ok {
iterator.pointer.current = next
return true
}
iterator.pointer.current = NilPairedBioSequenceBatch
*iterator.pointer.p_finished = true
return false
}
func (iterator IPairedBioSequenceBatch) PushBack() {
if !iterator.pointer.current.IsNil() {
iterator.pointer.pushBack = true
}
}
// The 'Get' method returns the instance of BioSequenceBatch
// currently pointed by the iterator. You have to use the
// 'Next' method to move to the next entry before calling
// 'Get' to retreive the following instance.
func (iterator IPairedBioSequenceBatch) Get() PairedBioSequenceBatch {
return iterator.pointer.current
}
// Finished returns 'true' value if no more data is available
// from the iterator.
func (iterator IPairedBioSequenceBatch) Finished() bool {
return *iterator.pointer.p_finished
}
func (iterator IPairedBioSequenceBatch) SortBatches(sizes ...int) IPairedBioSequenceBatch {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
newIter := MakeIPairedBioSequenceBatch(buffsize)
newIter.Add(1)
go func() {
newIter.Wait()
close(newIter.pointer.channel)
}()
next_to_send := 0
received := make(map[int]PairedBioSequenceBatch)
go func() {
for iterator.Next() {
batch := iterator.Get()
if batch.order == next_to_send {
newIter.pointer.channel <- batch
next_to_send++
batch, ok := received[next_to_send]
for ok {
newIter.pointer.channel <- batch
delete(received, next_to_send)
next_to_send++
batch, ok = received[next_to_send]
}
} else {
received[batch.order] = batch
}
}
newIter.Done()
}()
return newIter
}

View File

@ -40,5 +40,10 @@ func (input IBioSequence) CopyTee() (IBioSequence, IBioSequence) {
} }
}() }()
if input.IsPaired() {
first.MarkAsPaired()
second.MarkAsPaired()
}
return first, second return first, second
} }

View File

@ -3,6 +3,7 @@ package obiiter
import ( import (
"fmt" "fmt"
"os" "os"
"time"
"github.com/schollz/progressbar/v3" "github.com/schollz/progressbar/v3"
) )
@ -45,18 +46,29 @@ func (iterator IBioSequence) Speed(message ...string) IBioSequence {
bar := progressbar.NewOptions(-1, pbopt...) bar := progressbar.NewOptions(-1, pbopt...)
go func() { go func() {
c := 0
start := time.Now()
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
l := batch.Len() c += batch.Len()
newIter.Push(batch) newIter.Push(batch)
bar.Add(l) elapsed := time.Since(start)
if elapsed > (time.Millisecond * 100) {
bar.Add(c)
c = 0
start = time.Now()
}
} }
fmt.Fprintln(os.Stderr) fmt.Fprintln(os.Stderr)
newIter.Done() newIter.Done()
}() }()
if iterator.IsPaired() {
newIter.MarkAsPaired()
}
return newIter return newIter
} }

View File

@ -54,6 +54,10 @@ func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int)
} }
go f(iterator) go f(iterator)
if iterator.IsPaired() {
newIter.MarkAsPaired()
}
return newIter return newIter
} }
@ -99,6 +103,10 @@ func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePre
} }
go f(iterator) go f(iterator)
if iterator.IsPaired() {
newIter.MarkAsPaired()
}
return newIter return newIter
} }
@ -138,6 +146,10 @@ func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, size
} }
go f(iterator) go f(iterator)
if iterator.IsPaired() {
newIter.MarkAsPaired()
}
return newIter return newIter
} }

View File

@ -11,7 +11,7 @@ import (
) )
var _Debug = false var _Debug = false
var _ParallelWorkers = runtime.NumCPU() - 1 var _ParallelWorkers = runtime.NumCPU() * 2 - 1
var _MaxAllowedCPU = runtime.NumCPU() var _MaxAllowedCPU = runtime.NumCPU()
var _BufferSize = 1 var _BufferSize = 1
var _BatchSize = 5000 var _BatchSize = 5000
@ -19,7 +19,10 @@ var _BatchSize = 5000
type ArgumentParser func([]string) (*getoptions.GetOpt, []string, error) type ArgumentParser func([]string) (*getoptions.GetOpt, []string, error)
func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser { func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser {
options := getoptions.New() options := getoptions.New()
options.SetMode(getoptions.Bundling)
options.SetUnknownMode(getoptions.Fail)
options.Bool("help", false, options.Alias("h", "?")) options.Bool("help", false, options.Alias("h", "?"))
options.BoolVar(&_Debug, "debug", false) options.BoolVar(&_Debug, "debug", false)
@ -28,6 +31,7 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
options.Description("Number of parallele threads computing the result")) options.Description("Number of parallele threads computing the result"))
options.IntVar(&_MaxAllowedCPU, "max-cpu", _MaxAllowedCPU, options.IntVar(&_MaxAllowedCPU, "max-cpu", _MaxAllowedCPU,
options.GetEnv("OBIMAXCPU"),
options.Description("Number of parallele threads computing the result")) options.Description("Number of parallele threads computing the result"))
for _, o := range optionset { for _, o := range optionset {
@ -42,6 +46,10 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
runtime.GOMAXPROCS(_MaxAllowedCPU) runtime.GOMAXPROCS(_MaxAllowedCPU)
if options.Called("max-cpu") { if options.Called("max-cpu") {
log.Printf("CPU number limited to %d", _MaxAllowedCPU) log.Printf("CPU number limited to %d", _MaxAllowedCPU)
if ! options.Called("workers") {
_ParallelWorkers=_MaxAllowedCPU * 2 - 1
log.Printf("Number of workers set %d", _ParallelWorkers)
}
} }
if options.Called("no-singleton") { if options.Called("no-singleton") {

View File

@ -55,6 +55,7 @@ type BioSequence struct {
sequence []byte // The sequence itself, it is accessible by the methode Sequence sequence []byte // The sequence itself, it is accessible by the methode Sequence
qualities []byte // The quality scores of the sequence. qualities []byte // The quality scores of the sequence.
feature []byte feature []byte
paired *BioSequence // A pointer to the paired sequence
annotations Annotation annotations Annotation
} }
@ -69,6 +70,7 @@ func MakeEmptyBioSequence() BioSequence {
sequence: nil, sequence: nil,
qualities: nil, qualities: nil,
feature: nil, feature: nil,
paired: nil,
annotations: nil, annotations: nil,
} }
} }
@ -275,3 +277,4 @@ func (s *BioSequence) WriteByte(data byte) error {
func (s *BioSequence) Clear() { func (s *BioSequence) Clear() {
s.sequence = s.sequence[0:0] s.sequence = s.sequence[0:0]
} }

View File

@ -23,7 +23,7 @@ func NewBioSequenceSlice(size ...int) *BioSequenceSlice {
slice := _BioSequenceSlicePool.Get().(*BioSequenceSlice) slice := _BioSequenceSlicePool.Get().(*BioSequenceSlice)
if len(size) > 0 { if len(size) > 0 {
s := size[0] s := size[0]
slice.InsureCapacity(s) slice = slice.InsureCapacity(s)
(*slice)=(*slice)[0:s] (*slice)=(*slice)[0:s]
} }
return slice return slice

View File

@ -0,0 +1,58 @@
package obiseq
import log "github.com/sirupsen/logrus"
func (s *BioSequence) IsPaired() bool {
return s.paired != nil
}
func (s *BioSequence) PairedWith() *BioSequence {
return s.paired
}
func (s *BioSequence) PairTo(p *BioSequence) {
s.paired = p
if p != nil {
p.paired = s
}
}
func (s *BioSequence) UnPair() {
if s.paired != nil {
s.paired.paired = nil
}
s.paired = nil
}
func (s *BioSequenceSlice) IsPaired() bool {
return (*s)[0].paired != nil
}
func (s *BioSequenceSlice) PairedWith() *BioSequenceSlice {
ps := NewBioSequenceSlice(len(*s))
for i, seq := range *s {
(*ps)[i] = seq.PairedWith()
}
return ps
}
func (s *BioSequenceSlice) PairTo(p *BioSequenceSlice) {
if len(*s) != len(*p) {
log.Fatalf("Pairing of iterators: both batches have not the same length : (%d,%d)",
len(*s), len(*p),
)
}
for i, seq := range *s {
seq.PairTo((*p)[i])
}
}
func (s *BioSequenceSlice) UnPair() {
for _, seq := range *s {
seq.UnPair()
}
}

View File

@ -11,6 +11,61 @@ import (
type SequencePredicate func(*BioSequence) bool type SequencePredicate func(*BioSequence) bool
type SeqPredicateMode int
const (
ForwardOnly SeqPredicateMode = iota
ReverseOnly
And
Or
AndNot
Xor
)
func (predicate SequencePredicate) PredicateOnPaired(ifnotpaired bool) SequencePredicate {
if predicate == nil {
return nil
}
p := func(sequence *BioSequence) bool {
if sequence.IsPaired() {
return predicate(sequence.PairedWith())
}
return ifnotpaired
}
return p
}
func (predicate SequencePredicate) PairedPredicat(mode SeqPredicateMode) SequencePredicate {
if predicate == nil {
return nil
}
p := func(sequence *BioSequence) bool {
good := predicate(sequence)
if sequence.IsPaired() && mode != ForwardOnly {
pgood := predicate(sequence.PairedWith())
switch mode {
case ReverseOnly:
good = pgood
case And:
good = good && pgood
case Or:
good = good || pgood
case AndNot:
good = good && !pgood
case Xor:
good = (good || pgood) && !(good && pgood)
}
}
return good
}
return p
}
func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePredicate { func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePredicate {
switch { switch {
case predicate1 == nil: case predicate1 == nil:
@ -225,4 +280,3 @@ func ExpressionPredicat(expression string) SequencePredicate {
return f return f
} }

View File

@ -27,6 +27,9 @@ var __output_solexa_quality__ = false
var __no_progress_bar__ = false var __no_progress_bar__ = false
var __compressed__ = false var __compressed__ = false
var __output_file_name__ = "-"
var __paired_file_name__ = ""
func InputOptionSet(options *getoptions.GetOpt) { func InputOptionSet(options *getoptions.GetOpt) {
// options.IntVar(&__skipped_entries__, "skip", __skipped_entries__, // options.IntVar(&__skipped_entries__, "skip", __skipped_entries__,
// options.Description("The N first sequence records of the file are discarded from the analysis and not reported to the output file.")) // options.Description("The N first sequence records of the file are discarded from the analysis and not reported to the output file."))
@ -73,15 +76,29 @@ func OutputOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__no_progress_bar__, "no-progressbar", false, options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
options.Description("Disable the progress bar printing")) options.Description("Disable the progress bar printing"))
options.BoolVar(&__compressed__, "--compress", false, options.BoolVar(&__compressed__, "compress", false,
options.Alias("Z"), options.Alias("Z"),
options.Description("Output is compressed")) options.Description("Output is compressed"))
options.StringVar(&__output_file_name__, "out", __output_file_name__,
options.Alias("o"),
options.ArgName("FILENAME"),
options.Description("Filename used for saving the output"),
)
}
func PairedFilesOptionSet(options *getoptions.GetOpt) {
options.StringVar(&__paired_file_name__, "paired-with", __paired_file_name__,
options.ArgName("FILENAME"),
options.Description("Filename containing the paired reads"),
)
} }
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
InputOptionSet(options) InputOptionSet(options)
OutputOptionSet(options) OutputOptionSet(options)
PairedFilesOptionSet(options)
} }
// Returns true if the number of reads described in the // Returns true if the number of reads described in the
@ -170,3 +187,14 @@ func CLIOutputQualityShift() int {
func CLIProgressBar() bool { func CLIProgressBar() bool {
return !__no_progress_bar__ return !__no_progress_bar__
} }
func CLIOutPutFileName() string {
return __output_file_name__
}
func CLIHasPairedFile() bool {
return __paired_file_name__ != ""
}
func CLIPairedFileName() string {
return __paired_file_name__
}

View File

@ -67,7 +67,7 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
return list_of_files, nil return list_of_files, nil
} }
func ReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) { func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
var iterator obiiter.IBioSequence var iterator obiiter.IBioSequence
var reader func(string, ...obiformats.WithOption) (obiiter.IBioSequence, error) var reader func(string, ...obiformats.WithOption) (obiiter.IBioSequence, error)
@ -142,6 +142,17 @@ func ReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
if err != nil { if err != nil {
return obiiter.NilIBioSequence, err return obiiter.NilIBioSequence, err
} }
if CLIPairedFileName() != "" {
ip, err := reader(CLIPairedFileName(), opts...)
if err != nil {
return obiiter.NilIBioSequence, err
}
iterator = iterator.PairTo(ip)
}
} }
// list_of_files = list_of_files[1:] // list_of_files = list_of_files[1:]

View File

@ -1,6 +1,9 @@
package obiconvert package obiconvert
import ( import (
"path/filepath"
"strings"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
@ -8,6 +11,27 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
) )
func BuildPairedFileNames(filename string) (string, string) {
dir, name := filepath.Split(filename)
parts := strings.SplitN(name, ".", 2)
forward := parts[0] + "_R1"
reverse := parts[0] + "_R2"
if parts[1] != "" {
suffix := "." + parts[1]
forward += suffix
reverse += suffix
}
if dir != "" {
forward = filepath.Join(dir, forward)
reverse = filepath.Join(dir, reverse)
}
return forward, reverse
}
func CLIWriteBioSequences(iterator obiiter.IBioSequence, func CLIWriteBioSequences(iterator obiiter.IBioSequence,
terminalAction bool, filenames ...string) (obiiter.IBioSequence, error) { terminalAction bool, filenames ...string) (obiiter.IBioSequence, error) {
@ -45,7 +69,32 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
var err error var err error
// No file names are specified or it is "-" : the output is done on stdout
if CLIOutPutFileName() != "-" || (len(filenames) > 0 && filenames[0] != "-") {
var fn string
if len(filenames) == 0 { if len(filenames) == 0 {
fn = CLIOutPutFileName()
} else {
fn = filenames[0]
}
if iterator.IsPaired() {
var reverse string
fn, reverse = BuildPairedFileNames(fn)
opts = append(opts, obiformats.WritePairedReadsTo(reverse))
}
switch CLIOutputFormat() {
case "fastq":
newIter, err = obiformats.WriteFastqToFile(iterator, fn, opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToFile(iterator, fn, opts...)
default:
newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...)
}
} else {
switch CLIOutputFormat() { switch CLIOutputFormat() {
case "fastq": case "fastq":
newIter, err = obiformats.WriteFastqToStdout(iterator, opts...) newIter, err = obiformats.WriteFastqToStdout(iterator, opts...)
@ -54,15 +103,6 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
default: default:
newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...) newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...)
} }
} else {
switch CLIOutputFormat() {
case "fastq":
newIter, err = obiformats.WriteFastqToFile(iterator, filenames[0], opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToFile(iterator, filenames[0], opts...)
default:
newIter, err = obiformats.WriteSequencesToFile(iterator, filenames[0], opts...)
}
} }
if err != nil { if err != nil {

View File

@ -8,11 +8,15 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
) )
func IFilterSequence(iterator obiiter.IBioSequence) obiiter.IBioSequence { func CLIFilterSequence(iterator obiiter.IBioSequence) obiiter.IBioSequence {
var newIter obiiter.IBioSequence var newIter obiiter.IBioSequence
predicate := CLISequenceSelectionPredicate() predicate := CLISequenceSelectionPredicate()
if obiconvert.CLIHasPairedFile() {
predicate = predicate.PairedPredicat(CLIPairedReadMode())
}
if predicate != nil { if predicate != nil {
if CLISaveDiscardedSequences() { if CLISaveDiscardedSequences() {
var discarded obiiter.IBioSequence var discarded obiiter.IBioSequence

View File

@ -40,6 +40,8 @@ var _AttributePatterns = make(map[string]string, 0)
var _InvertMatch = false var _InvertMatch = false
var _SaveRejected = "" var _SaveRejected = ""
var _PairedMode = "forward"
func TaxonomySelectionOptionSet(options *getoptions.GetOpt) { func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_Taxdump, "taxdump", _Taxdump, options.StringVar(&_Taxdump, "taxdump", _Taxdump,
@ -135,6 +137,11 @@ func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
"Several -a options can be used on the same command line and in this last case, the selected "+ "Several -a options can be used on the same command line and in this last case, the selected "+
"sequence records will match all constraints.")) "sequence records will match all constraints."))
options.StringVar(&_PairedMode, "paired-mode", _PairedMode,
options.ArgName("forward|reverse|and|or|andnot|xor"),
options.Description("If paired reads are passed to obibrep, that option determines how the conditions "+
"are applied to both reads."),
)
} }
// OptionSet adds to the basic option set every options declared for // OptionSet adds to the basic option set every options declared for
@ -412,3 +419,24 @@ func CLISaveDiscardedSequences() bool {
func CLIDiscardedFileName() string { func CLIDiscardedFileName() string {
return _SaveRejected return _SaveRejected
} }
func CLIPairedReadMode() obiseq.SeqPredicateMode {
switch _PairedMode {
case "forward":
return obiseq.ForwardOnly
case "reverse":
return obiseq.ReverseOnly
case "and":
return obiseq.And
case "or":
return obiseq.Or
case "andnot":
return obiseq.AndNot
case "xor":
return obiseq.Xor
default:
log.Fatalf("Paired reads mode must be forward, reverse, and, or, andnot, or xor (%s)", _PairedMode)
}
return obiseq.ForwardOnly
}

View File

@ -6,8 +6,8 @@ import (
"github.com/DavidGamba/go-getoptions" "github.com/DavidGamba/go-getoptions"
) )
var _ForwardFiles = make([]string, 0, 10) var _ForwardFile = ""
var _ReverseFiles = make([]string, 0, 10) var _ReverseFile = ""
var _Delta = 5 var _Delta = 5
var _MinOverlap = 20 var _MinOverlap = 20
var _GapPenality = float64(2.0) var _GapPenality = float64(2.0)
@ -15,15 +15,15 @@ var _WithoutStats = false
var _MinIdentity = 0.9 var _MinIdentity = 0.9
func PairingOptionSet(options *getoptions.GetOpt) { func PairingOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&_ForwardFiles, "forward-reads", options.StringVar(&_ForwardFile, "forward-reads", "",
1, 1000,
options.Alias("F"), options.Alias("F"),
options.Required("You must provide at least one forward file"), options.ArgName("FILENAME_F"),
options.Required("You must provide at a forward file"),
options.Description("The file names containing the forward reads")) options.Description("The file names containing the forward reads"))
options.StringSliceVar(&_ReverseFiles, "reverse-reads", options.StringVar(&_ReverseFile, "reverse-reads", "",
1, 1000,
options.Alias("R"), options.Alias("R"),
options.Required("You must provide at least one reverse file"), options.ArgName("FILENAME_R"),
options.Required("You must provide a reverse file"),
options.Description("The file names containing the reverse reads")) options.Description("The file names containing the reverse reads"))
options.IntVar(&_Delta, "delta", _Delta, options.IntVar(&_Delta, "delta", _Delta,
options.Alias("D"), options.Alias("D"),
@ -42,42 +42,43 @@ func PairingOptionSet(options *getoptions.GetOpt) {
} }
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options) obiconvert.OutputOptionSet(options)
obiconvert.InputOptionSet(options)
PairingOptionSet(options) PairingOptionSet(options)
} }
func IBatchPairedSequence() (obiiter.IPairedBioSequenceBatch, error) { func CLIPairedSequence() (obiiter.IBioSequence, error) {
forward, err := obiconvert.ReadBioSequences(_ForwardFiles...) forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
if err != nil { if err != nil {
return obiiter.NilIPairedBioSequenceBatch, err return obiiter.NilIBioSequence, err
} }
reverse, err := obiconvert.ReadBioSequences(_ReverseFiles...) reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
if err != nil { if err != nil {
return obiiter.NilIPairedBioSequenceBatch, err return obiiter.NilIBioSequence, err
} }
paired := forward.PairWith(reverse) paired := forward.PairTo(reverse)
return paired, nil return paired, nil
} }
func Delta() int { func CLIDelta() int {
return _Delta return _Delta
} }
func MinOverlap() int { func CLIMinOverlap() int {
return _MinOverlap return _MinOverlap
} }
func MinIdentity() float64 { func CLIMinIdentity() float64 {
return _MinIdentity return _MinIdentity
} }
func GapPenality() float64 { func CLIGapPenality() float64 {
return _GapPenality return _GapPenality
} }
func WithStats() bool { func CLIWithStats() bool {
return !_WithoutStats return !_WithoutStats
} }

View File

@ -3,12 +3,12 @@ package obipairing
import ( import (
"math" "math"
"os" "os"
"runtime"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"github.com/schollz/progressbar/v3" "github.com/schollz/progressbar/v3"
) )
@ -203,12 +203,16 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
// //
// The function returns an iterator over batches of obiseq.Biosequence object. // The function returns an iterator over batches of obiseq.Biosequence object.
// each pair of processed sequences produces one sequence in the result iterator. // each pair of processed sequences produces one sequence in the result iterator.
func IAssemblePESequencesBatch(iterator obiiter.IPairedBioSequenceBatch, func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
gap float64, delta, minOverlap int, gap float64, delta, minOverlap int,
minIdentity float64, minIdentity float64,
withStats bool, sizes ...int) obiiter.IBioSequence { withStats bool, sizes ...int) obiiter.IBioSequence {
nworkers := runtime.NumCPU() * 3 / 2 if !iterator.IsPaired() {
log.Fatalln("Sequence data must be paired")
}
nworkers := obioptions.CLIMaxCPU() * 3 / 2
buffsize := iterator.BufferSize() buffsize := iterator.BufferSize()
if len(sizes) > 0 { if len(sizes) > 0 {
@ -236,15 +240,15 @@ func IAssemblePESequencesBatch(iterator obiiter.IPairedBioSequenceBatch,
progressbar.OptionShowIts(), progressbar.OptionShowIts(),
progressbar.OptionSetDescription("[Sequence Pairing]")) progressbar.OptionSetDescription("[Sequence Pairing]"))
f := func(iterator obiiter.IPairedBioSequenceBatch, wid int) { f := func(iterator obiiter.IBioSequence, wid int) {
arena := obialign.MakePEAlignArena(150, 150) arena := obialign.MakePEAlignArena(150, 150)
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
cons := make(obiseq.BioSequenceSlice, len(batch.Forward())) cons := make(obiseq.BioSequenceSlice, len(batch.Slice()))
processed := 0 processed := 0
for i, A := range batch.Forward() { for i, A := range batch.Slice() {
B := batch.Reverse()[i] B := A.PairedWith()
cons[i] = AssemblePESequences(A, B, gap, delta, minOverlap, minIdentity, withStats, true, arena) cons[i] = AssemblePESequences(A, B, gap, delta, minOverlap, minIdentity, withStats, true, arena)
if i%59 == 0 { if i%59 == 0 {
bar.Add(59) bar.Add(59)