Work on iterators and recycling of biosequences

This commit is contained in:
2022-01-14 23:11:36 +01:00
parent ef66ca4972
commit e8fff6477b
22 changed files with 350 additions and 111 deletions

View File

@ -13,6 +13,6 @@ func main() {
_, args, _ := optionParser(os.Args) _, args, _ := optionParser(os.Args)
fs, _ := obiconvert.ReadBioSequences(args...) fs, _ := obiconvert.ReadBioSequencesBatch(args...)
obiconvert.WriteBioSequences(fs) obiconvert.WriteBioSequencesBatch(fs,true)
} }

View File

@ -4,7 +4,6 @@ import (
"fmt" "fmt"
"log" "log"
"os" "os"
"runtime/trace"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obicount" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obicount"
@ -21,12 +20,12 @@ func main() {
// pprof.StartCPUProfile(f) // pprof.StartCPUProfile(f)
// defer pprof.StopCPUProfile() // defer pprof.StopCPUProfile()
ftrace, err := os.Create("cpu.trace") // ftrace, err := os.Create("cpu.trace")
if err != nil { // if err != nil {
log.Fatal(err) // log.Fatal(err)
} // }
trace.Start(ftrace) // trace.Start(ftrace)
defer trace.Stop() // defer trace.Stop()
optionParser := obioptions.GenerateOptionParser( optionParser := obioptions.GenerateOptionParser(
obiconvert.InputOptionSet, obiconvert.InputOptionSet,
@ -47,6 +46,7 @@ func main() {
nread += s.Count() nread += s.Count()
nvariant++ nvariant++
nsymbol += s.Length() nsymbol += s.Length()
(&s).Recycle()
} }
if obicount.IsPrintingVariantCount() { if obicount.IsPrintingVariantCount() {

View File

@ -1,24 +1,22 @@
package main package main
import ( import (
"log"
"os" "os"
"runtime/pprof"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obipairing" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obipairing"
) )
func main() { func main() {
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof // go tool pprof -http=":8000" ./obipairing ./cpu.pprof
f, err := os.Create("cpu.pprof") // f, err := os.Create("cpu.pprof")
if err != nil { // if err != nil {
log.Fatal(err) // log.Fatal(err)
} // }
pprof.StartCPUProfile(f) // pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile() // defer pprof.StopCPUProfile()
// go tool trace cpu.trace // go tool trace cpu.trace
// ftrace, err := os.Create("cpu.trace") // ftrace, err := os.Create("cpu.trace")
@ -33,6 +31,5 @@ func main() {
optionParser(os.Args) optionParser(os.Args)
pairs, _ := obipairing.IBatchPairedSequence() pairs, _ := obipairing.IBatchPairedSequence()
paired := obipairing.IAssemblePESequencesBatch(pairs, 2, 50, 20, true) paired := obipairing.IAssemblePESequencesBatch(pairs, 2, 50, 20, true)
written, _ := obiformats.WriteFastqBatchToStdout(paired) obiconvert.WriteBioSequencesBatch(paired, true)
written.Destroy()
} }

View File

@ -30,5 +30,5 @@ func main() {
sequences, _ := obiconvert.ReadBioSequencesBatch(args...) sequences, _ := obiconvert.ReadBioSequencesBatch(args...)
amplicons, _ := obipcr.PCR(sequences) amplicons, _ := obipcr.PCR(sequences)
obiconvert.WriteBioSequences(amplicons) obiconvert.WriteBioSequencesBatch(amplicons,true)
} }

View File

@ -59,4 +59,9 @@ func main() {
sA.ReverseComplement(true) sA.ReverseComplement(true)
fmt.Println(string(sA.Sequence())) fmt.Println(string(sA.Sequence()))
fmt.Println(string(sA.Id())) fmt.Println(string(sA.Id()))
sA.Reset()
fmt.Println(sA.Length())
fmt.Println(sA.String())
} }

View File

@ -282,6 +282,11 @@ func PEAlign(seqA, seqB obiseq.BioSequence,
_InitDNAScoreMatrix() _InitDNAScoreMatrix()
} }
// log.Println("==============")
// log.Println(seqA.String())
// log.Println(seqB.String())
// log.Println("--------------")
index := obikmer.Index4mer(seqA, index := obikmer.Index4mer(seqA,
&arena.pointer.fastIndex, &arena.pointer.fastIndex,
&arena.pointer.fastBuffer) &arena.pointer.fastBuffer)
@ -294,6 +299,10 @@ func PEAlign(seqA, seqB obiseq.BioSequence,
over = seqB.Length() + shift over = seqB.Length() + shift
} }
// log.Println(seqA.String())
// log.Println(seqB.String())
// log.Printf("Shift : %d Score : %d Over : %d La : %d:%d Lb: %d:%d\n", shift, fastScore, over, seqA.Length(), len(seqA.Qualities()), seqB.Length(), len(seqB.Qualities()))
if fastScore+3 < over { if fastScore+3 < over {
if shift > 0 { if shift > 0 {
startA = shift - delta startA = shift - delta

View File

@ -241,7 +241,7 @@ func _Pcr(seq ApatSequence, sequence obiseq.BioSequence,
match, _ := sequence.Subsequence(fm[0], fm[1], opt.pointer.circular) match, _ := sequence.Subsequence(fm[0], fm[1], opt.pointer.circular)
annot["forward_match"] = match.String() annot["forward_match"] = match.String()
match.Destroy() (&match).Recycle()
annot["forward_error"] = erri annot["forward_error"] = erri
@ -249,7 +249,7 @@ func _Pcr(seq ApatSequence, sequence obiseq.BioSequence,
match, _ = sequence.Subsequence(rm[0], rm[1], opt.pointer.circular) match, _ = sequence.Subsequence(rm[0], rm[1], opt.pointer.circular)
match = match.ReverseComplement(true) match = match.ReverseComplement(true)
annot["reverse_match"] = match.String() annot["reverse_match"] = match.String()
match.Destroy() (&match).Recycle()
annot["reverse_error"] = errj annot["reverse_error"] = errj
results = append(results, amplicon) results = append(results, amplicon)
@ -315,14 +315,14 @@ func _Pcr(seq ApatSequence, sequence obiseq.BioSequence,
match, _ := sequence.Subsequence(rm[0], rm[1], opt.pointer.circular) match, _ := sequence.Subsequence(rm[0], rm[1], opt.pointer.circular)
match.ReverseComplement(true) match.ReverseComplement(true)
annot["forward_match"] = match.String() annot["forward_match"] = match.String()
match.Destroy() (&match).Recycle()
annot["forward_error"] = errj annot["forward_error"] = errj
annot["reverse_primer"] = reverse.String() annot["reverse_primer"] = reverse.String()
match, _ = sequence.Subsequence(fm[0], fm[1], opt.pointer.circular) match, _ = sequence.Subsequence(fm[0], fm[1], opt.pointer.circular)
annot["reverse_match"] = match.String() annot["reverse_match"] = match.String()
match.Destroy() (&match).Recycle()
annot["reverse_error"] = erri annot["reverse_error"] = erri
results = append(results, amplicon) results = append(results, amplicon)

View File

@ -7,7 +7,6 @@ import (
"log" "log"
"os" "os"
"strings" "strings"
"sync"
"time" "time"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
@ -82,26 +81,30 @@ func WriteFastaToStdout(iterator obiseq.IBioSequence, options ...WithOption) err
return WriteFasta(iterator, os.Stdout, options...) return WriteFasta(iterator, os.Stdout, options...)
} }
func WriteFastaBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options ...WithOption) error { func WriteFastaBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
opt := MakeOptions(options)
buffsize := iterator.BufferSize() buffsize := iterator.BufferSize()
newIter := obiseq.MakeIBioSequenceBatch(buffsize) newIter := obiseq.MakeIBioSequenceBatch(buffsize)
opt := MakeOptions(options) nwriters := opt.ParallelWorkers()
nwriters := 4
chunkchan := make(chan FileChunck) chunkchan := make(chan FileChunck)
chunkwait := sync.WaitGroup{}
header_format := opt.FormatFastSeqHeader() header_format := opt.FormatFastSeqHeader()
chunkwait.Add(nwriters) newIter.Add(nwriters)
go func() { go func() {
chunkwait.Wait() newIter.Wait()
for len(chunkchan) > 0 { for len(chunkchan) > 0 {
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
} }
close(chunkchan) close(chunkchan)
for len(newIter.Channel()) > 0 {
time.Sleep(time.Millisecond)
}
close(newIter.Channel())
}() }()
ff := func(iterator obiseq.IBioSequenceBatch) { ff := func(iterator obiseq.IBioSequenceBatch) {
@ -116,9 +119,11 @@ func WriteFastaBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options
newIter.Done() newIter.Done()
} }
for i := 0; i < nwriters; i++ { log.Println("Start of the fasta file writing")
for i := 0; i < nwriters-1; i++ {
go ff(iterator.Split()) go ff(iterator.Split())
} }
go ff(iterator)
next_to_send := 0 next_to_send := 0
received := make(map[int]FileChunck, 100) received := make(map[int]FileChunck, 100)
@ -142,22 +147,22 @@ func WriteFastaBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options
} }
}() }()
return nil return newIter, nil
} }
func WriteFastaBatchToStdout(iterator obiseq.IBioSequenceBatch, options ...WithOption) error { func WriteFastaBatchToStdout(iterator obiseq.IBioSequenceBatch, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
return WriteFastaBatch(iterator, os.Stdout, options...) return WriteFastaBatch(iterator, os.Stdout, options...)
} }
func WriteFastaBatchToFile(iterator obiseq.IBioSequenceBatch, func WriteFastaBatchToFile(iterator obiseq.IBioSequenceBatch,
filename string, filename string,
options ...WithOption) error { options ...WithOption) (obiseq.IBioSequenceBatch, error) {
file, err := os.Create(filename) file, err := os.Create(filename)
if err != nil { if err != nil {
log.Fatalf("open file error: %v", err) log.Fatalf("open file error: %v", err)
return err return obiseq.NilIBioSequenceBatch, err
} }
return WriteFastaBatch(iterator, file, options...) return WriteFastaBatch(iterator, file, options...)

View File

@ -82,11 +82,12 @@ type FileChunck struct {
} }
func WriteFastqBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options ...WithOption) (obiseq.IBioSequenceBatch, error) { func WriteFastqBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
opt := MakeOptions(options)
buffsize := iterator.BufferSize() buffsize := iterator.BufferSize()
newIter := obiseq.MakeIBioSequenceBatch(buffsize) newIter := obiseq.MakeIBioSequenceBatch(buffsize)
opt := MakeOptions(options) nwriters := opt.ParallelWorkers()
nwriters := 4
chunkchan := make(chan FileChunck) chunkchan := make(chan FileChunck)
@ -110,19 +111,21 @@ func WriteFastqBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options
ff := func(iterator obiseq.IBioSequenceBatch) { ff := func(iterator obiseq.IBioSequenceBatch) {
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
chunkchan <- FileChunck{ chunk := FileChunck{
FormatFastqBatch(batch, quality, header_format), FormatFastqBatch(batch, quality, header_format),
batch.Order(), batch.Order(),
} }
chunkchan <- chunk
newIter.Channel() <- batch newIter.Channel() <- batch
} }
newIter.Done() newIter.Done()
} }
log.Println("Start of the fastq file reading") log.Println("Start of the fastq file writing")
for i := 0; i < nwriters; i++ { for i := 0; i < nwriters-1; i++ {
go ff(iterator.Split()) go ff(iterator.Split())
} }
go ff(iterator)
next_to_send := 0 next_to_send := 0
received := make(map[int]FileChunck, 100) received := make(map[int]FileChunck, 100)

View File

@ -52,28 +52,45 @@ func WriteSequencesToStdout(iterator obiseq.IBioSequence, options ...WithOption)
return WriteSequences(iterator, os.Stdout, options...) return WriteSequences(iterator, os.Stdout, options...)
} }
// func WriteSequenceBatch(iterator obiseq.IBioSequenceBatch, func WriteSequenceBatch(iterator obiseq.IBioSequenceBatch,
// file io.Writer, file io.Writer,
// options ...WithOption) error { options ...WithOption) (obiseq.IBioSequenceBatch,error) {
// opts := MakeOptions(options) var newIter obiseq.IBioSequenceBatch
var err error
// header_format := opts.FormatFastSeqHeader() ok := iterator.Next()
// quality := opts.QualityShift()
// ok := iterator.Next() if ok {
iterator.PushBack()
batch := iterator.Get()
if batch.Slice()[0].HasQualities() {
newIter,err = WriteFastqBatch(iterator, file, options...)
} else {
newIter,err = WriteFastaBatch(iterator, file, options...)
}
// if ok { return newIter,err
// batch := iterator.Get() }
// if batch.Slice()[0].HasQualities() {
// file.Write()
// fmt.Fprintln(file, FormatFastq(seq, quality, header_format))
// WriteFastq(iterator, file, options...)
// } else {
// fmt.Fprintln(file, FormatFasta(seq, header_format))
// WriteFasta(iterator, file, options...)
// }
// }
// return nil return obiseq.NilIBioSequenceBatch,fmt.Errorf("input iterator not ready")
// } }
// WriteSequencesBatchToStdout writes the batches provided by iterator to
// os.Stdout by delegating to WriteSequenceBatch, and returns that call's
// iterator and error unchanged.
func WriteSequencesBatchToStdout(iterator obiseq.IBioSequenceBatch,
options ...WithOption) (obiseq.IBioSequenceBatch,error) {
return WriteSequenceBatch(iterator, os.Stdout, options...)
}
// WriteSequencesBatchToFile creates the file named filename with os.Create
// and passes it, together with iterator and options, to WriteSequenceBatch.
// The iterator and error returned by WriteSequenceBatch are returned as is.
//
// A failure to create the file is reported through log.Fatalf.
func WriteSequencesBatchToFile(iterator obiseq.IBioSequenceBatch,
	filename string,
	options ...WithOption) (obiseq.IBioSequenceBatch, error) {

	output, createErr := os.Create(filename)

	if createErr != nil {
		log.Fatalf("open file error: %v", createErr)
		return obiseq.NilIBioSequenceBatch, createErr
	}

	return WriteSequenceBatch(iterator, output, options...)
}

View File

@ -3,18 +3,26 @@ package obioptions
import ( import (
"fmt" "fmt"
"os" "os"
"runtime"
"github.com/DavidGamba/go-getoptions" "github.com/DavidGamba/go-getoptions"
) )
var __debug__ = false var _Debug = false
var _ParallelWorkers = runtime.NumCPU() - 1
var _BufferSize = 1
var _BatchSize = 5000
type ArgumentParser func([]string) (*getoptions.GetOpt, []string, error) type ArgumentParser func([]string) (*getoptions.GetOpt, []string, error)
func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser { func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser {
options := getoptions.New() options := getoptions.New()
options.Bool("help", false, options.Alias("h", "?")) options.Bool("help", false, options.Alias("h", "?"))
options.BoolVar(&__debug__, "debug", false) options.BoolVar(&_Debug, "debug", false)
options.IntVar(&_ParallelWorkers, "workers", runtime.NumCPU()-1,
options.Alias("w"),
options.Description("Number of parallele threads computing the result"))
for _, o := range optionset { for _, o := range optionset {
o(options) o(options)
@ -32,15 +40,33 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
} }
} }
// Predicate indicating if the debug mode is activated // Predicate indicating if the debug mode is activated.
func IsDebugMode() bool { func IsDebugMode() bool {
return __debug__ return _Debug
} }
// ParallelWorkers returns the number of parallel workers requested by
// the command line option --workers|-w. The value is read from the
// package-level _ParallelWorkers variable, whose default is
// runtime.NumCPU() - 1.
func ParallelWorkers() int {
return _ParallelWorkers
}
// BufferSize returns the expected channel buffer size for obitools.
// The value is read from the package-level _BufferSize variable,
// which is initialised to 1.
func BufferSize() int {
return _BufferSize
}
// BatchSize returns the expected size of the sequence batches.
// The value is read from the package-level _BatchSize variable,
// which is initialised to 5000.
func BatchSize() int {
return _BatchSize
}
// DebugOn sets the debug mode on.
func DebugOn() { func DebugOn() {
__debug__ = true _Debug = true
} }
// DebugOff sets the debug mode off.
func DebugOff() { func DebugOff() {
__debug__ = false _Debug = false
} }

View File

@ -39,6 +39,7 @@ func (batch BioSequenceBatch) IsNil() bool {
type __ibiosequencebatch__ struct { type __ibiosequencebatch__ struct {
channel chan BioSequenceBatch channel chan BioSequenceBatch
current BioSequenceBatch current BioSequenceBatch
pushBack bool
all_done *sync.WaitGroup all_done *sync.WaitGroup
buffer_size int buffer_size int
finished bool finished bool
@ -61,9 +62,11 @@ func MakeIBioSequenceBatch(sizes ...int) IBioSequenceBatch {
i := __ibiosequencebatch__{ i := __ibiosequencebatch__{
channel: make(chan BioSequenceBatch, buffsize), channel: make(chan BioSequenceBatch, buffsize),
current: NilBioSequenceBatch, current: NilBioSequenceBatch,
pushBack: false,
buffer_size: buffsize, buffer_size: buffsize,
finished: false, finished: false,
p_finished: nil} p_finished: nil,
}
i.p_finished = &i.finished i.p_finished = &i.finished
waiting := sync.WaitGroup{} waiting := sync.WaitGroup{}
i.all_done = &waiting i.all_done = &waiting
@ -99,6 +102,7 @@ func (iterator IBioSequenceBatch) Split() IBioSequenceBatch {
i := __ibiosequencebatch__{ i := __ibiosequencebatch__{
channel: iterator.pointer.channel, channel: iterator.pointer.channel,
current: NilBioSequenceBatch, current: NilBioSequenceBatch,
pushBack: false,
all_done: iterator.pointer.all_done, all_done: iterator.pointer.all_done,
buffer_size: iterator.pointer.buffer_size, buffer_size: iterator.pointer.buffer_size,
finished: false, finished: false,
@ -111,6 +115,12 @@ func (iterator IBioSequenceBatch) Next() bool {
if *(iterator.pointer.p_finished) { if *(iterator.pointer.p_finished) {
return false return false
} }
if iterator.pointer.pushBack {
iterator.pointer.pushBack = false
return true
}
next, ok := (<-iterator.pointer.channel) next, ok := (<-iterator.pointer.channel)
if ok { if ok {
@ -123,6 +133,12 @@ func (iterator IBioSequenceBatch) Next() bool {
return false return false
} }
// PushBack flags the current batch as pushed back, so that the following
// call to Next reports it again instead of receiving a new batch from the
// channel. It does nothing when the iterator has no current batch.
func (iterator IBioSequenceBatch) PushBack() {
	state := iterator.pointer

	if state.current.IsNil() {
		return
	}

	state.pushBack = true
}
// The 'Get' method returns the instance of BioSequenceBatch // The 'Get' method returns the instance of BioSequenceBatch
// currently pointed by the iterator. You have to use the // currently pointed by the iterator. You have to use the
// 'Next' method to move to the next entry before calling // 'Next' method to move to the next entry before calling
@ -303,14 +319,14 @@ func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBa
return newIter return newIter
} }
func (iterator IBioSequenceBatch) Destroy() { func (iterator IBioSequenceBatch) Recycle() {
log.Println("Start recycling of Bioseq objects") log.Println("Start recycling of Bioseq objects")
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
for _, seq := range batch.Slice() { for _, seq := range batch.Slice() {
(&seq).Destroy() (&seq).Recycle()
} }
} }
log.Println("End of the recycling of Bioseq objects") log.Println("End of the recycling of Bioseq objects")

View File

@ -44,7 +44,7 @@ func (s BioSequence) IsNil() bool {
return s.sequence == nil return s.sequence == nil
} }
func (s BioSequence) Reset() { func (s *BioSequence) Reset() {
s.sequence.id.Reset() s.sequence.id.Reset()
s.sequence.definition.Reset() s.sequence.definition.Reset()
s.sequence.sequence.Reset() s.sequence.sequence.Reset()
@ -168,6 +168,10 @@ func (s BioSequence) SetQualities(qualities Quality) {
s.sequence.qualities.Write(qualities) s.sequence.qualities.Write(qualities)
} }
// WriteQualities passes data to the Write method of the sequence's
// qualities buffer and returns that call's byte count and error.
func (s BioSequence) WriteQualities(data []byte) (int, error) {
return s.sequence.qualities.Write(data)
}
func (s BioSequence) Write(data []byte) (int, error) { func (s BioSequence) Write(data []byte) (int, error) {
return s.sequence.sequence.Write(data) return s.sequence.sequence.Write(data)
} }

View File

@ -10,10 +10,11 @@ import (
type __ibiosequence__ struct { type __ibiosequence__ struct {
channel chan BioSequence channel chan BioSequence
current BioSequence current BioSequence
pushBack bool
all_done *sync.WaitGroup all_done *sync.WaitGroup
buffer_size int buffer_size int
finished bool finished bool
p_finished *bool pFinished *bool
} }
type IBioSequence struct { type IBioSequence struct {
@ -55,10 +56,13 @@ func MakeIBioSequence(sizes ...int) IBioSequence {
i := __ibiosequence__{ i := __ibiosequence__{
channel: make(chan BioSequence, buffsize), channel: make(chan BioSequence, buffsize),
current: NilBioSequence, current: NilBioSequence,
pushBack: false,
buffer_size: buffsize, buffer_size: buffsize,
finished: false, finished: false,
p_finished: nil} pFinished: nil,
i.p_finished = &i.finished }
i.pFinished = &i.finished
waiting := sync.WaitGroup{} waiting := sync.WaitGroup{}
i.all_done = &waiting i.all_done = &waiting
ii := IBioSequence{&i} ii := IBioSequence{&i}
@ -66,23 +70,32 @@ func MakeIBioSequence(sizes ...int) IBioSequence {
} }
func (iterator IBioSequence) Split() IBioSequence { func (iterator IBioSequence) Split() IBioSequence {
i := __ibiosequence__{ i := __ibiosequence__{
channel: iterator.pointer.channel, channel: iterator.pointer.channel,
current: NilBioSequence, current: NilBioSequence,
pushBack: false,
finished: false, finished: false,
all_done: iterator.pointer.all_done, all_done: iterator.pointer.all_done,
buffer_size: iterator.pointer.buffer_size, buffer_size: iterator.pointer.buffer_size,
p_finished: iterator.pointer.p_finished} pFinished: iterator.pointer.pFinished,
}
newIter := IBioSequence{&i} newIter := IBioSequence{&i}
return newIter return newIter
} }
func (iterator IBioSequence) Next() bool { func (iterator IBioSequence) Next() bool {
if iterator.IsNil() || *(iterator.pointer.p_finished) { if iterator.IsNil() || *(iterator.pointer.pFinished) {
iterator.pointer.current = NilBioSequence iterator.pointer.current = NilBioSequence
return false return false
} }
if iterator.pointer.pushBack {
iterator.pointer.pushBack = false
return true
}
next, ok := (<-iterator.pointer.channel) next, ok := (<-iterator.pointer.channel)
if ok { if ok {
@ -91,10 +104,16 @@ func (iterator IBioSequence) Next() bool {
} }
iterator.pointer.current = NilBioSequence iterator.pointer.current = NilBioSequence
*iterator.pointer.p_finished = true *iterator.pointer.pFinished = true
return false return false
} }
// PushBack flags the current sequence as pushed back, so that the following
// call to Next reports it again instead of receiving a new sequence from the
// channel. It does nothing when the iterator has no current sequence.
func (iterator IBioSequence) PushBack() {
	state := iterator.pointer

	if state.current.IsNil() {
		return
	}

	state.pushBack = true
}
// The 'Get' method returns the instance of BioSequence // The 'Get' method returns the instance of BioSequence
// currently pointed by the iterator. You have to use the // currently pointed by the iterator. You have to use the
// 'Next' method to move to the next entry before calling // 'Next' method to move to the next entry before calling
@ -106,7 +125,7 @@ func (iterator IBioSequence) Get() BioSequence {
// Finished returns 'true' value if no more data is available // Finished returns 'true' value if no more data is available
// from the iterator. // from the iterator.
func (iterator IBioSequence) Finished() bool { func (iterator IBioSequence) Finished() bool {
return *iterator.pointer.p_finished return *iterator.pointer.pFinished
} }
func (iterator IBioSequence) BufferSize() int { func (iterator IBioSequence) BufferSize() int {

View File

@ -55,6 +55,7 @@ func (batch PairedBioSequenceBatch) IsNil() bool {
type __ipairedbiosequencebatch__ struct { type __ipairedbiosequencebatch__ struct {
channel chan PairedBioSequenceBatch channel chan PairedBioSequenceBatch
current PairedBioSequenceBatch current PairedBioSequenceBatch
pushBack bool
all_done *sync.WaitGroup all_done *sync.WaitGroup
buffer_size int buffer_size int
finished bool finished bool
@ -77,9 +78,12 @@ func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch {
i := __ipairedbiosequencebatch__{ i := __ipairedbiosequencebatch__{
channel: make(chan PairedBioSequenceBatch, buffsize), channel: make(chan PairedBioSequenceBatch, buffsize),
current: NilPairedBioSequenceBatch, current: NilPairedBioSequenceBatch,
pushBack: false,
buffer_size: buffsize, buffer_size: buffsize,
finished: false, finished: false,
p_finished: nil} p_finished: nil,
}
i.p_finished = &i.finished i.p_finished = &i.finished
waiting := sync.WaitGroup{} waiting := sync.WaitGroup{}
i.all_done = &waiting i.all_done = &waiting
@ -115,6 +119,7 @@ func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch {
i := __ipairedbiosequencebatch__{ i := __ipairedbiosequencebatch__{
channel: iterator.pointer.channel, channel: iterator.pointer.channel,
current: NilPairedBioSequenceBatch, current: NilPairedBioSequenceBatch,
pushBack: false,
all_done: iterator.pointer.all_done, all_done: iterator.pointer.all_done,
buffer_size: iterator.pointer.buffer_size, buffer_size: iterator.pointer.buffer_size,
finished: false, finished: false,
@ -127,6 +132,12 @@ func (iterator IPairedBioSequenceBatch) Next() bool {
if *(iterator.pointer.p_finished) { if *(iterator.pointer.p_finished) {
return false return false
} }
if iterator.pointer.pushBack {
iterator.pointer.pushBack = false
return true
}
next, ok := (<-iterator.pointer.channel) next, ok := (<-iterator.pointer.channel)
if ok { if ok {
@ -139,6 +150,12 @@ func (iterator IPairedBioSequenceBatch) Next() bool {
return false return false
} }
// PushBack flags the current paired batch as pushed back, so that the
// following call to Next reports it again instead of receiving a new batch
// from the channel. It does nothing when the iterator has no current batch.
func (iterator IPairedBioSequenceBatch) PushBack() {
	state := iterator.pointer

	if state.current.IsNil() {
		return
	}

	state.pushBack = true
}
// The 'Get' method returns the instance of BioSequenceBatch // The 'Get' method returns the instance of BioSequenceBatch
// currently pointed by the iterator. You have to use the // currently pointed by the iterator. You have to use the
// 'Next' method to move to the next entry before calling // 'Next' method to move to the next entry before calling

View File

@ -14,7 +14,6 @@ var __bioseq__pool__ = sync.Pool{
func MakeEmptyBioSequence() BioSequence { func MakeEmptyBioSequence() BioSequence {
bs := BioSequence{__bioseq__pool__.Get().(*__sequence__)} bs := BioSequence{__bioseq__pool__.Get().(*__sequence__)}
bs.Reset()
return bs return bs
} }
@ -23,12 +22,13 @@ func MakeBioSequence(id string,
definition string) BioSequence { definition string) BioSequence {
bs := MakeEmptyBioSequence() bs := MakeEmptyBioSequence()
bs.SetId(id) bs.SetId(id)
bs.SetSequence(sequence) bs.Write(sequence)
bs.SetDefinition(definition) bs.SetDefinition(definition)
return bs return bs
} }
func (sequence *BioSequence) Destroy() { func (sequence *BioSequence) Recycle() {
sequence.Reset()
__bioseq__pool__.Put(sequence.sequence) __bioseq__pool__.Put(sequence.sequence)
sequence.sequence = nil sequence.sequence = nil
} }

View File

@ -84,9 +84,10 @@ func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IB
} }
log.Println("Start of the batch workers") log.Println("Start of the batch workers")
for i := 0; i < nworkers; i++ { for i := 0; i < nworkers-1; i++ {
go f(iterator.Split()) go f(iterator.Split())
} }
go f(iterator)
return newIter return newIter
} }
@ -126,9 +127,10 @@ func (iterator IBioSequenceBatch) MakeISliceWorker(worker SeqSliceWorker, sizes
} }
log.Println("Start of the batch slice workers") log.Println("Start of the batch slice workers")
for i := 0; i < nworkers; i++ { for i := 0; i < nworkers - 1; i++ {
go f(iterator.Split()) go f(iterator.Split())
} }
go f(iterator)
return newIter return newIter
} }

View File

@ -7,10 +7,11 @@ import (
"strings" "strings"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
) )
func __expand_list_of_files__(check_ext bool, filenames ...string) ([]string, error) { func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
var err error var err error
list_of_files := make([]string, 0, 100) list_of_files := make([]string, 0, 100)
for _, fn := range filenames { for _, fn := range filenames {
@ -32,7 +33,7 @@ func __expand_list_of_files__(check_ext bool, filenames ...string) ([]string, er
if info.IsDir() { if info.IsDir() {
if path != fn { if path != fn {
subdir, e := __expand_list_of_files__(true, path) subdir, e := _ExpandListOfFiles(true, path)
if e != nil { if e != nil {
return e return e
} }
@ -80,6 +81,15 @@ func ReadBioSequencesBatch(filenames ...string) (obiseq.IBioSequenceBatch, error
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader)) opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader))
} }
nworkers := obioptions.ParallelWorkers() / 4
if nworkers < 2 {
nworkers = 2
}
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBufferSize(obioptions.BufferSize()))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.BatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(InputQualityShift())) opts = append(opts, obiformats.OptionsQualityShift(InputQualityShift()))
if len(filenames) == 0 { if len(filenames) == 0 {
@ -94,7 +104,7 @@ func ReadBioSequencesBatch(filenames ...string) (obiseq.IBioSequenceBatch, error
} }
} else { } else {
list_of_files, err := __expand_list_of_files__(false, filenames...) list_of_files, err := _ExpandListOfFiles(false, filenames...)
if err != nil { if err != nil {
return obiseq.NilIBioSequenceBatch, err return obiseq.NilIBioSequenceBatch, err
} }

View File

@ -4,6 +4,7 @@ import (
"log" "log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
) )
@ -23,6 +24,15 @@ func WriteBioSequences(iterator obiseq.IBioSequence, filenames ...string) error
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader)) opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
} }
nworkers := obioptions.ParallelWorkers() / 4
if nworkers < 2 {
nworkers = 2
}
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBufferSize(obioptions.BufferSize()))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.BatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(OutputQualityShift())) opts = append(opts, obiformats.OptionsQualityShift(OutputQualityShift()))
var err error var err error
@ -54,3 +64,68 @@ func WriteBioSequences(iterator obiseq.IBioSequence, filenames ...string) error
return nil return nil
} }
// WriteBioSequencesBatch writes the batches provided by iterator using the
// output settings of the obiconvert package.
//
// The header format comes from OutputFastHeaderFormat ("obi" selects OBI
// headers, anything else JSON headers) and the file format from OutputFormat
// ("fastq", "fasta", or the generic sequence writer for any other value).
// Output goes to filenames[0] when at least one filename is given and to
// standard output otherwise.
//
// When terminalAction is true the iterator produced by the writer is consumed
// with Recycle and obiseq.NilIBioSequenceBatch is returned; otherwise the
// writer's iterator is returned to the caller. A writer error is reported
// through log.Fatalf.
func WriteBioSequencesBatch(iterator obiseq.IBioSequenceBatch,
	terminalAction bool, filenames ...string) (obiseq.IBioSequenceBatch, error) {

	opts := make([]obiformats.WithOption, 0, 10)

	// Only "obi" selects OBI headers; "json" and every other value use JSON.
	if OutputFastHeaderFormat() == "obi" {
		log.Println("On output use OBI headers")
		opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader))
	} else {
		log.Println("On output use JSON headers")
		opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
	}

	// A quarter of the requested workers, but never fewer than two.
	nworkers := obioptions.ParallelWorkers() / 4
	if nworkers < 2 {
		nworkers = 2
	}

	opts = append(opts,
		obiformats.OptionsParallelWorkers(nworkers),
		obiformats.OptionsBufferSize(obioptions.BufferSize()),
		obiformats.OptionsBatchSize(obioptions.BatchSize()),
		obiformats.OptionsQualityShift(OutputQualityShift()),
	)

	var (
		written obiseq.IBioSequenceBatch
		err     error
	)

	if len(filenames) == 0 {
		switch OutputFormat() {
		case "fastq":
			written, err = obiformats.WriteFastqBatchToStdout(iterator, opts...)
		case "fasta":
			written, err = obiformats.WriteFastaBatchToStdout(iterator, opts...)
		default:
			written, err = obiformats.WriteSequencesBatchToStdout(iterator, opts...)
		}
	} else {
		// Only the first filename is used as the output target.
		switch OutputFormat() {
		case "fastq":
			written, err = obiformats.WriteFastqBatchToFile(iterator, filenames[0], opts...)
		case "fasta":
			written, err = obiformats.WriteFastaBatchToFile(iterator, filenames[0], opts...)
		default:
			written, err = obiformats.WriteSequencesBatchToFile(iterator, filenames[0], opts...)
		}
	}

	if err != nil {
		log.Fatalf("Write file error: %v", err)
		return obiseq.NilIBioSequenceBatch, err
	}

	if !terminalAction {
		return written, nil
	}

	// Terminal step of the pipeline: consume the writer's iterator here.
	written.Recycle()
	return obiseq.NilIBioSequenceBatch, nil
}

View File

@ -4,6 +4,7 @@ import (
"log" "log"
"math" "math"
"os" "os"
"runtime"
"time" "time"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
@ -11,34 +12,40 @@ import (
"github.com/schollz/progressbar/v3" "github.com/schollz/progressbar/v3"
) )
func __abs__(x int) int { func _Abs(x int) int {
if x < 0 { if x < 0 {
return -x return -x
} }
return x return x
} }
// JoinPairedSequence pastes seqB after seqA, separated by a stretch of
// ten '.' characters, and returns the joined sequence. The qualities of
// the ten separator positions are set to 0 and are followed by the
// qualities of seqB.
// When inplace is false the join is written to a copy of seqA obtained
// with seqA.Copy(); when inplace is true it is written through seqA
// directly and that same value is returned.
func JoinPairedSequence(seqA, seqB obiseq.BioSequence) obiseq.BioSequence { func JoinPairedSequence(seqA, seqB obiseq.BioSequence, inplace bool) obiseq.BioSequence {
js := make([]byte, seqA.Length(), seqA.Length()+seqB.Length()+10)
jq := make([]byte, seqA.Length(), seqA.Length()+seqB.Length()+10)
copy(js, seqA.Sequence()) if !inplace {
copy(jq, seqA.Qualities()) seqA = seqA.Copy()
}
js = append(js, '.', '.', '.', '.', '.', '.', '.', '.', '.', '.') seqA.WriteString("..........")
jq = append(jq, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) seqA.Write(seqB.Sequence())
js = append(js, seqB.Sequence()...) seqA.WriteQualities(obiseq.Quality{0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
jq = append(jq, seqB.Qualities()...) seqA.WriteQualities(seqB.Qualities())
rep := obiseq.MakeBioSequence(seqA.Id(), js, seqA.Definition()) return seqA
rep.SetQualities(jq)
return rep
} }
// AssemblePESequences assembles two paired sequences following
// the obipairing strategy implemented in obialign.PEAlign using
// the gap and delta parameters.
// If the length of the overlap between both sequences is less than
// overlap_min, the alignment is substituted by a simple pasting
// of the sequences with a stretch of 10 dots in between them.
// The quality of the dots is set to 0.
// If the inplace parameter is set to true, seqA and seqB are
// destroyed during the assembling process and cannot be reused later on.
func AssemblePESequences(seqA, seqB obiseq.BioSequence, func AssemblePESequences(seqA, seqB obiseq.BioSequence,
gap, delta, overlap_min int, with_stats bool, gap, delta, overlap_min int, with_stats bool,
inplace bool,
arena_align obialign.PEAlignArena, arena_align obialign.PEAlignArena,
arena_cons obialign.BuildAlignArena, arena_cons obialign.BuildAlignArena,
arena_qual obialign.BuildAlignArena) obiseq.BioSequence { arena_qual obialign.BuildAlignArena) obiseq.BioSequence {
@ -53,7 +60,7 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
right = path[len(path)-2] right = path[len(path)-2]
} }
lcons := cons.Length() lcons := cons.Length()
ali_length := lcons - __abs__(left) - __abs__(right) ali_length := lcons - _Abs(left) - _Abs(right)
if ali_length >= overlap_min { if ali_length >= overlap_min {
if with_stats { if with_stats {
@ -85,14 +92,22 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
annot["seq_ab_match"] = match annot["seq_ab_match"] = match
annot["score_norm"] = score_norm annot["score_norm"] = score_norm
if inplace {
(&seqA).Recycle()
(&seqB).Recycle()
}
} }
} else { } else {
cons = JoinPairedSequence(seqA, seqB) cons = JoinPairedSequence(seqA, seqB, inplace)
if with_stats { if with_stats {
annot := cons.Annotations() annot := cons.Annotations()
annot["mode"] = "join" annot["mode"] = "join"
} }
if inplace {
(&seqB).Recycle()
}
} }
return cons return cons
@ -101,7 +116,7 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch, func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
gap, delta, overlap_min int, with_stats bool, sizes ...int) obiseq.IBioSequenceBatch { gap, delta, overlap_min int, with_stats bool, sizes ...int) obiseq.IBioSequenceBatch {
nworkers := 7 nworkers := runtime.NumCPU() - 1
buffsize := iterator.BufferSize() buffsize := iterator.BufferSize()
if len(sizes) > 0 { if len(sizes) > 0 {
@ -148,13 +163,11 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
processed := 0 processed := 0
for i, A := range batch.Forward() { for i, A := range batch.Forward() {
B := batch.Reverse()[i] B := batch.Reverse()[i]
cons[i] = AssemblePESequences(A, B, 2, 5, 20, true, arena, barena1, barena2) cons[i] = AssemblePESequences(A, B, 2, 5, 20, true, true, arena, barena1, barena2)
if i%59 == 0 { if i%59 == 0 {
bar.Add(59) bar.Add(59)
processed += 59 processed += 59
} }
A.Destroy()
B.Destroy()
} }
bar.Add(batch.Length() - processed) bar.Add(batch.Length() - processed)
newIter.Channel() <- obiseq.MakeBioSequenceBatch( newIter.Channel() <- obiseq.MakeBioSequenceBatch(
@ -169,9 +182,10 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
log.Printf("Start of the sequence Pairing") log.Printf("Start of the sequence Pairing")
for i := 0; i < nworkers; i++ { for i := 0; i < nworkers-1; i++ {
go f(iterator.Split(), i) go f(iterator.Split(), i)
} }
go f(iterator, nworkers-1)
return newIter return newIter

View File

@ -15,6 +15,8 @@ var _AllowedMismatch = 0
var _MinimumLength = 0 var _MinimumLength = 0
var _MaximumLength = -1 var _MaximumLength = -1
// PCROptionSet adds to a command line option set every option
// needed by the PCR algorithm.
func PCROptionSet(options *getoptions.GetOpt) { func PCROptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_Circular, "circular", false, options.BoolVar(&_Circular, "circular", false,
options.Alias("c"), options.Alias("c"),
@ -40,11 +42,15 @@ func PCROptionSet(options *getoptions.GetOpt) {
options.Description("Maximum length of the barcode (primers excluded).")) options.Description("Maximum length of the barcode (primers excluded)."))
} }
// OptionSet adds to the given option set every option declared for
// the obipcr command: the options registered by obiconvert.OptionSet
// followed by the PCR specific ones registered by PCROptionSet.
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options) obiconvert.OptionSet(options)
PCROptionSet(options) PCROptionSet(options)
} }
// ForwardPrimer returns the sequence of the forward primer as indicated by the
// --forward command line option
func ForwardPrimer() string { func ForwardPrimer() string {
pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch) pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch)
@ -57,6 +63,8 @@ func ForwardPrimer() string {
return _ForwardPrimer return _ForwardPrimer
} }
// ReversePrimer returns the sequence of the reverse primer as indicated by the
// --reverse command line option
func ReversePrimer() string { func ReversePrimer() string {
pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch) pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch)
@ -69,18 +77,27 @@ func ReversePrimer() string {
return _ReversePrimer return _ReversePrimer
} }
// AllowedMismatch returns the allowed mismatch count between each
// primer and the sequences as indicated by the
// --allowed-mismatches|-e command line option.
func AllowedMismatch() int { func AllowedMismatch() int {
return _AllowedMismatch return _AllowedMismatch
} }
// Circular returns the considered sequence topology as indicated by the
// --circular|-c command line option.
func Circular() bool { func Circular() bool {
return _Circular return _Circular
} }
// MinLength returns the amplicon minimum length as indicated by the
// --min-length|-l command line option.
func MinLength() int { func MinLength() int {
return _MinimumLength return _MinimumLength
} }
// MaxLength returns the amplicon maximum length (primers excluded) as
// indicated by the --max-length|-L command line option.
func MaxLength() int { func MaxLength() int {
return _MaximumLength return _MaximumLength
} }

View File

@ -5,7 +5,10 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
) )
func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequence, error) { // PCR iterates over sequences provided by a obiseq.IBioSequenceBatch
// and returns another obiseq.IBioSequenceBatch distributing
// obiseq.BioSequenceBatch containing the selected amplicon sequences.
func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequenceBatch, error) {
forward := ForwardPrimer() forward := ForwardPrimer()
reverse := ReversePrimer() reverse := ReversePrimer()
@ -28,5 +31,5 @@ func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequence, error) {
worker := obiapat.PCRSliceWorker(forward, reverse, opts...) worker := obiapat.PCRSliceWorker(forward, reverse, opts...)
return iterator.MakeISliceWorker(worker).IBioSequence(), nil return iterator.MakeISliceWorker(worker), nil
} }