mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 16:50:27 +00:00
Work on iterators and recycling of biosequences
This commit is contained in:
@@ -7,10 +7,11 @@ import (
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
func __expand_list_of_files__(check_ext bool, filenames ...string) ([]string, error) {
|
||||
func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
|
||||
var err error
|
||||
list_of_files := make([]string, 0, 100)
|
||||
for _, fn := range filenames {
|
||||
@@ -32,7 +33,7 @@ func __expand_list_of_files__(check_ext bool, filenames ...string) ([]string, er
|
||||
|
||||
if info.IsDir() {
|
||||
if path != fn {
|
||||
subdir, e := __expand_list_of_files__(true, path)
|
||||
subdir, e := _ExpandListOfFiles(true, path)
|
||||
if e != nil {
|
||||
return e
|
||||
}
|
||||
@@ -80,6 +81,15 @@ func ReadBioSequencesBatch(filenames ...string) (obiseq.IBioSequenceBatch, error
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader))
|
||||
}
|
||||
|
||||
nworkers := obioptions.ParallelWorkers() / 4
|
||||
if nworkers < 2 {
|
||||
nworkers = 2
|
||||
}
|
||||
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||
opts = append(opts, obiformats.OptionsBufferSize(obioptions.BufferSize()))
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.BatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsQualityShift(InputQualityShift()))
|
||||
|
||||
if len(filenames) == 0 {
|
||||
@@ -94,7 +104,7 @@ func ReadBioSequencesBatch(filenames ...string) (obiseq.IBioSequenceBatch, error
|
||||
}
|
||||
} else {
|
||||
|
||||
list_of_files, err := __expand_list_of_files__(false, filenames...)
|
||||
list_of_files, err := _ExpandListOfFiles(false, filenames...)
|
||||
if err != nil {
|
||||
return obiseq.NilIBioSequenceBatch, err
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"log"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
@@ -23,6 +24,15 @@ func WriteBioSequences(iterator obiseq.IBioSequence, filenames ...string) error
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
|
||||
}
|
||||
|
||||
nworkers := obioptions.ParallelWorkers() / 4
|
||||
if nworkers < 2 {
|
||||
nworkers = 2
|
||||
}
|
||||
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||
opts = append(opts, obiformats.OptionsBufferSize(obioptions.BufferSize()))
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.BatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsQualityShift(OutputQualityShift()))
|
||||
|
||||
var err error
|
||||
@@ -54,3 +64,68 @@ func WriteBioSequences(iterator obiseq.IBioSequence, filenames ...string) error
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func WriteBioSequencesBatch(iterator obiseq.IBioSequenceBatch,
|
||||
terminalAction bool, filenames ...string) (obiseq.IBioSequenceBatch, error) {
|
||||
|
||||
var newIter obiseq.IBioSequenceBatch
|
||||
|
||||
opts := make([]obiformats.WithOption, 0, 10)
|
||||
|
||||
switch OutputFastHeaderFormat() {
|
||||
case "json":
|
||||
log.Println("On output use JSON headers")
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
|
||||
case "obi":
|
||||
log.Println("On output use OBI headers")
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader))
|
||||
default:
|
||||
log.Println("On output use JSON headers")
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
|
||||
}
|
||||
|
||||
nworkers := obioptions.ParallelWorkers() / 4
|
||||
if nworkers < 2 {
|
||||
nworkers = 2
|
||||
}
|
||||
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||
opts = append(opts, obiformats.OptionsBufferSize(obioptions.BufferSize()))
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.BatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsQualityShift(OutputQualityShift()))
|
||||
|
||||
var err error
|
||||
|
||||
if len(filenames) == 0 {
|
||||
switch OutputFormat() {
|
||||
case "fastq":
|
||||
newIter, err = obiformats.WriteFastqBatchToStdout(iterator, opts...)
|
||||
case "fasta":
|
||||
newIter, err = obiformats.WriteFastaBatchToStdout(iterator, opts...)
|
||||
default:
|
||||
newIter, err = obiformats.WriteSequencesBatchToStdout(iterator, opts...)
|
||||
}
|
||||
} else {
|
||||
switch OutputFormat() {
|
||||
case "fastq":
|
||||
newIter, err = obiformats.WriteFastqBatchToFile(iterator, filenames[0], opts...)
|
||||
case "fasta":
|
||||
newIter, err = obiformats.WriteFastaBatchToFile(iterator, filenames[0], opts...)
|
||||
default:
|
||||
newIter, err = obiformats.WriteSequencesBatchToFile(iterator, filenames[0], opts...)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Write file error: %v", err)
|
||||
return obiseq.NilIBioSequenceBatch, err
|
||||
}
|
||||
|
||||
if terminalAction {
|
||||
newIter.Recycle()
|
||||
return obiseq.NilIBioSequenceBatch, nil
|
||||
}
|
||||
|
||||
return newIter, nil
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
"runtime"
|
||||
"time"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
|
||||
@@ -11,34 +12,40 @@ import (
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
|
||||
func __abs__(x int) int {
|
||||
func _Abs(x int) int {
|
||||
if x < 0 {
|
||||
return -x
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
func JoinPairedSequence(seqA, seqB obiseq.BioSequence) obiseq.BioSequence {
|
||||
js := make([]byte, seqA.Length(), seqA.Length()+seqB.Length()+10)
|
||||
jq := make([]byte, seqA.Length(), seqA.Length()+seqB.Length()+10)
|
||||
func JoinPairedSequence(seqA, seqB obiseq.BioSequence, inplace bool) obiseq.BioSequence {
|
||||
|
||||
copy(js, seqA.Sequence())
|
||||
copy(jq, seqA.Qualities())
|
||||
if !inplace {
|
||||
seqA = seqA.Copy()
|
||||
}
|
||||
|
||||
js = append(js, '.', '.', '.', '.', '.', '.', '.', '.', '.', '.')
|
||||
jq = append(jq, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
seqA.WriteString("..........")
|
||||
seqA.Write(seqB.Sequence())
|
||||
|
||||
js = append(js, seqB.Sequence()...)
|
||||
jq = append(jq, seqB.Qualities()...)
|
||||
seqA.WriteQualities(obiseq.Quality{0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
|
||||
seqA.WriteQualities(seqB.Qualities())
|
||||
|
||||
rep := obiseq.MakeBioSequence(seqA.Id(), js, seqA.Definition())
|
||||
rep.SetQualities(jq)
|
||||
|
||||
return rep
|
||||
return seqA
|
||||
}
|
||||
|
||||
// AssemblePESequences assembles two paired sequences following
|
||||
// the obipairing strategy implemented in obialign.PEAlign using
|
||||
// the gap and delta parameters.
|
||||
// If the length of the overlap between both sequences is less than
|
||||
// overlap_min, the alignment is replaced by a simple pasting
|
||||
// of the sequences with a stretch of 10 dots in between them.
|
||||
// The quality of the dots is set to 0.
|
||||
// If the inplace parameter is set to true, seqA and seqB are
|
||||
// destroyed during the assembling process and cannot be reused later on.
|
||||
func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
||||
gap, delta, overlap_min int, with_stats bool,
|
||||
inplace bool,
|
||||
arena_align obialign.PEAlignArena,
|
||||
arena_cons obialign.BuildAlignArena,
|
||||
arena_qual obialign.BuildAlignArena) obiseq.BioSequence {
|
||||
@@ -53,7 +60,7 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
||||
right = path[len(path)-2]
|
||||
}
|
||||
lcons := cons.Length()
|
||||
ali_length := lcons - __abs__(left) - __abs__(right)
|
||||
ali_length := lcons - _Abs(left) - _Abs(right)
|
||||
|
||||
if ali_length >= overlap_min {
|
||||
if with_stats {
|
||||
@@ -85,14 +92,22 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
||||
annot["seq_ab_match"] = match
|
||||
annot["score_norm"] = score_norm
|
||||
|
||||
if inplace {
|
||||
(&seqA).Recycle()
|
||||
(&seqB).Recycle()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cons = JoinPairedSequence(seqA, seqB)
|
||||
cons = JoinPairedSequence(seqA, seqB, inplace)
|
||||
|
||||
if with_stats {
|
||||
annot := cons.Annotations()
|
||||
annot["mode"] = "join"
|
||||
}
|
||||
|
||||
if inplace {
|
||||
(&seqB).Recycle()
|
||||
}
|
||||
}
|
||||
|
||||
return cons
|
||||
@@ -101,7 +116,7 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
||||
func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
||||
gap, delta, overlap_min int, with_stats bool, sizes ...int) obiseq.IBioSequenceBatch {
|
||||
|
||||
nworkers := 7
|
||||
nworkers := runtime.NumCPU() - 1
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
@@ -148,13 +163,11 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
||||
processed := 0
|
||||
for i, A := range batch.Forward() {
|
||||
B := batch.Reverse()[i]
|
||||
cons[i] = AssemblePESequences(A, B, 2, 5, 20, true, arena, barena1, barena2)
|
||||
cons[i] = AssemblePESequences(A, B, 2, 5, 20, true, true, arena, barena1, barena2)
|
||||
if i%59 == 0 {
|
||||
bar.Add(59)
|
||||
processed += 59
|
||||
}
|
||||
A.Destroy()
|
||||
B.Destroy()
|
||||
}
|
||||
bar.Add(batch.Length() - processed)
|
||||
newIter.Channel() <- obiseq.MakeBioSequenceBatch(
|
||||
@@ -169,9 +182,10 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
||||
|
||||
log.Printf("Start of the sequence Pairing")
|
||||
|
||||
for i := 0; i < nworkers; i++ {
|
||||
for i := 0; i < nworkers-1; i++ {
|
||||
go f(iterator.Split(), i)
|
||||
}
|
||||
go f(iterator, nworkers-1)
|
||||
|
||||
return newIter
|
||||
|
||||
|
||||
@@ -15,6 +15,8 @@ var _AllowedMismatch = 0
|
||||
var _MinimumLength = 0
|
||||
var _MaximumLength = -1
|
||||
|
||||
// PCROptionSet adds to a command line option set every option
|
||||
// needed by the PCR algorithm.
|
||||
func PCROptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_Circular, "circular", false,
|
||||
options.Alias("c"),
|
||||
@@ -40,11 +42,15 @@ func PCROptionSet(options *getoptions.GetOpt) {
|
||||
options.Description("Maximum length of the barcode (primers excluded)."))
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every option declared for
|
||||
// the obipcr command
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
PCROptionSet(options)
|
||||
}
|
||||
|
||||
// ForwardPrimer returns the sequence of the forward primer as indicated by the
|
||||
// --forward command line option
|
||||
func ForwardPrimer() string {
|
||||
pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch)
|
||||
|
||||
@@ -57,6 +63,8 @@ func ForwardPrimer() string {
|
||||
return _ForwardPrimer
|
||||
}
|
||||
|
||||
// ReversePrimer returns the sequence of the reverse primer as indicated by the
|
||||
// --reverse command line option
|
||||
func ReversePrimer() string {
|
||||
pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch)
|
||||
|
||||
@@ -69,18 +77,27 @@ func ReversePrimer() string {
|
||||
return _ReversePrimer
|
||||
}
|
||||
|
||||
// AllowedMismatch returns the allowed mismatch count between each
|
||||
// primer and the sequences as indicated by the
|
||||
// --allowed-mismatches|-e command line option
|
||||
func AllowedMismatch() int {
|
||||
return _AllowedMismatch
|
||||
}
|
||||
|
||||
// Circular returns the considered sequence topology as indicated by the
|
||||
// --circular|-c command line option
|
||||
func Circular() bool {
|
||||
return _Circular
|
||||
}
|
||||
|
||||
// MinLength returns the amplicon minimum length as indicated by the
|
||||
// --min-length|-l command line option
|
||||
func MinLength() int {
|
||||
return _MinimumLength
|
||||
}
|
||||
|
||||
// MaxLength returns the amplicon maximum length as indicated by the
|
||||
// --max-length|-L command line option
|
||||
func MaxLength() int {
|
||||
return _MaximumLength
|
||||
}
|
||||
|
||||
@@ -5,7 +5,10 @@ import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequence, error) {
|
||||
// PCR iterates over sequences provided by an obiseq.IBioSequenceBatch
|
||||
// and returns another obiseq.IBioSequenceBatch distributing
|
||||
// obiseq.BioSequenceBatch containing the selected amplicon sequences.
|
||||
func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequenceBatch, error) {
|
||||
|
||||
forward := ForwardPrimer()
|
||||
reverse := ReversePrimer()
|
||||
@@ -28,5 +31,5 @@ func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequence, error) {
|
||||
|
||||
worker := obiapat.PCRSliceWorker(forward, reverse, opts...)
|
||||
|
||||
return iterator.MakeISliceWorker(worker).IBioSequence(), nil
|
||||
return iterator.MakeISliceWorker(worker), nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user