Change the API of workers

Former-commit-id: 9b07306edd8cf28266f86f95823948fa99d39ea9
This commit is contained in:
2024-03-02 16:03:46 -04:00
parent 4a0b20484f
commit 0f3871d203
19 changed files with 194 additions and 120 deletions

View File

@ -2,13 +2,26 @@
## Latest changes ## Latest changes
### API Changes
- Two of the main class `obiseq.SeqWorker` and `obiseq.SeqWorker` have their declaration changed.
Both now return two values a `obiseq.BioSequenceSlice` and an `error`. This allow a worker to
return potentially several sequences as the result of the processing of a single sequence, or
zero, which is equivalent to filter out the input sequence.
### Enhancement ### Enhancement
- The bug corrected in the parsing of EMBL and Genbank files as implemented in version 4.1.2 of OBITools4, - The bug corrected in the parsing of EMBL and Genbank files as implemented in version 4.1.2 of OBITools4,
potentially induced some reduction in the performance of the parsing. This should have been now fixed. potentially induced some reduction in the performance of the parsing. This should have been now fixed.
- In the same idea, parsing of genbank and EMBL files were reading and storing in memory not only the sequence - In the same idea, parsing of genbank and EMBL files were reading and storing in memory not only
the sequence
but also the annotations (features table). Up to now none of the obitools are using this information, but but also the annotations (features table). Up to now none of the obitools are using this information, but
with large complete genomes, it is occupying a lot of memory. To reduce this impact, the new version of the with large complete genomes, it is occupying a lot of memory. To reduce this impact, the new version of
parser doesn't any more store in memory the annotations by default. the parser doesn't any more store in memory the annotations by default.
- Add a **--taxonomic-path** to `obiannotate`. The option adds a `taxonomic_path` tag to sequences describing
the taxonomic classification of the sequence according to its taxid. The path is a string. Each level of the
path is delimited by a `|` character. A level consists of three parts separated by a `@`. The first part is the
taxid, the second the scientific name and the last the taxonomic rank. The first level described is always the
root of the taxonomy. The latest corresponds to the taxid of the sequence. If a sequence is not annotated by
a taxid, as usual the sequence is assumed having the taxid 1 (the root of the taxonomy).
## February 16th, 2024. Release 4.1.2 ## February 16th, 2024. Release 4.1.2

View File

@ -24,7 +24,7 @@ func main() {
os.Exit(1) os.Exit(1)
} }
comp := fs.MakeIWorker(obiseq.ReverseComplementWorker(true)) comp := fs.MakeIWorker(obiseq.ReverseComplementWorker(true), true)
obiconvert.CLIWriteBioSequences(comp, true) obiconvert.CLIWriteBioSequences(comp, true)
obiiter.WaitForLastPipe() obiiter.WaitForLastPipe()

View File

@ -524,10 +524,10 @@ func PCRSlice(sequences obiseq.BioSequenceSlice,
func PCRSliceWorker(options ...WithOption) obiseq.SeqSliceWorker { func PCRSliceWorker(options ...WithOption) obiseq.SeqSliceWorker {
opt := MakeOptions(options) opt := MakeOptions(options)
worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice { worker := func(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
result := _PCRSlice(sequences, opt) result := _PCRSlice(sequences, opt)
sequences.Recycle(true) sequences.Recycle(true)
return result return result, nil
} }
return worker return worker

View File

@ -14,7 +14,7 @@ func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
fslot := slot + "_Fwd" fslot := slot + "_Fwd"
rslot := slot + "_Rev" rslot := slot + "_Rev"
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
matchesF := len(matcher.FindAllByteSlice(s.Sequence())) matchesF := len(matcher.FindAllByteSlice(s.Sequence()))
matchesR := len(matcher.FindAllByteSlice(s.ReverseComplement(false).Sequence())) matchesR := len(matcher.FindAllByteSlice(s.ReverseComplement(false).Sequence()))
@ -26,7 +26,7 @@ func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
s.SetAttribute(rslot, matchesR) s.SetAttribute(rslot, matchesR)
} }
return s return obiseq.BioSequenceSlice{s}, nil
} }
return f return f

View File

@ -31,5 +31,6 @@ func IParseFastSeqHeaderBatch(iterator obiiter.IBioSequence,
options ...WithOption) obiiter.IBioSequence { options ...WithOption) obiiter.IBioSequence {
opt := MakeOptions(options) opt := MakeOptions(options)
return iterator.MakeIWorker(obiseq.AnnotatorToSeqWorker(opt.ParseFastSeqHeader()), return iterator.MakeIWorker(obiseq.AnnotatorToSeqWorker(opt.ParseFastSeqHeader()),
false,
opt.ParallelWorkers()) opt.ParallelWorkers())
} }

View File

@ -15,7 +15,9 @@ import (
// Moreover the SeqWorker function, the method accepted two optional integer parameters. // Moreover the SeqWorker function, the method accepted two optional integer parameters.
// - First is allowing to indicates the number of workers running in parallele (default 4) // - First is allowing to indicates the number of workers running in parallele (default 4)
// - The second the size of the chanel buffer. By default set to the same value than the input buffer. // - The second the size of the chanel buffer. By default set to the same value than the input buffer.
func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence { func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker,
breakOnError bool,
sizes ...int) IBioSequence {
nworkers := obioptions.CLIParallelWorkers() nworkers := obioptions.CLIParallelWorkers()
if len(sizes) > 0 { if len(sizes) > 0 {
@ -32,11 +34,15 @@ func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int)
}() }()
sw := obiseq.SeqToSliceWorker(worker, true, breakOnError)
f := func(iterator IBioSequence) { f := func(iterator IBioSequence) {
var err error
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
for i, seq := range batch.slice { batch.slice, err = sw(batch.slice)
batch.slice[i] = worker(seq) if err != nil && breakOnError {
log.Fatalf("Error on sequence processing : %v", err)
} }
newIter.Push(batch) newIter.Push(batch)
} }
@ -67,7 +73,7 @@ func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int)
// Return: // Return:
// - newIter: A new IBioSequence iterator with the modified sequences. // - newIter: A new IBioSequence iterator with the modified sequences.
func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePredicate, func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePredicate,
worker obiseq.SeqWorker, sizes ...int) IBioSequence { worker obiseq.SeqWorker, breakOnError bool, sizes ...int) IBioSequence {
nworkers := obioptions.CLIReadParallelWorkers() nworkers := obioptions.CLIReadParallelWorkers()
if len(sizes) > 0 { if len(sizes) > 0 {
@ -84,13 +90,15 @@ func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePre
}() }()
sw := obiseq.SeqToSliceConditionalWorker(predicate, worker, true, breakOnError)
f := func(iterator IBioSequence) { f := func(iterator IBioSequence) {
var err error
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
for i, seq := range batch.slice { batch.slice, err = sw(batch.slice)
if predicate(batch.slice[i]) { if err != nil && breakOnError {
batch.slice[i] = worker(seq) log.Fatalf("Error on sequence processing : %v", err)
}
} }
newIter.Push(batch) newIter.Push(batch)
} }
@ -120,7 +128,7 @@ func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePre
// provided, the default number of workers is used. // provided, the default number of workers is used.
// //
// The function returns a new IBioSequence containing the modified slices. // The function returns a new IBioSequence containing the modified slices.
func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, sizes ...int) IBioSequence { func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, breakOnError bool, sizes ...int) IBioSequence {
nworkers := obioptions.CLIParallelWorkers() nworkers := obioptions.CLIParallelWorkers()
if len(sizes) > 0 { if len(sizes) > 0 {
@ -137,9 +145,13 @@ func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, size
}() }()
f := func(iterator IBioSequence) { f := func(iterator IBioSequence) {
var err error
for iterator.Next() { for iterator.Next() {
batch := iterator.Get() batch := iterator.Get()
batch.slice = worker(batch.slice) batch.slice, err = worker(batch.slice)
if err != nil && breakOnError {
log.Fatalf("Error on sequence processing : %v", err)
}
newIter.Push(batch) newIter.Push(batch)
} }
newIter.Done() newIter.Done()
@ -169,9 +181,9 @@ func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, size
// //
// Return: // Return:
// - f: A Pipeable object that represents the closure created by the WorkerPipe function. // - f: A Pipeable object that represents the closure created by the WorkerPipe function.
func WorkerPipe(worker obiseq.SeqWorker, sizes ...int) Pipeable { func WorkerPipe(worker obiseq.SeqWorker, breakOnError bool, sizes ...int) Pipeable {
f := func(iterator IBioSequence) IBioSequence { f := func(iterator IBioSequence) IBioSequence {
return iterator.MakeIWorker(worker, sizes...) return iterator.MakeIWorker(worker, breakOnError, sizes...)
} }
return f return f
@ -182,9 +194,9 @@ func WorkerPipe(worker obiseq.SeqWorker, sizes ...int) Pipeable {
// The worker parameter is the SeqSliceWorker to be applied. // The worker parameter is the SeqSliceWorker to be applied.
// The sizes parameter is a variadic parameter representing the sizes of the slices. // The sizes parameter is a variadic parameter representing the sizes of the slices.
// The function returns a Pipeable function that applies the SeqSliceWorker to the iterator. // The function returns a Pipeable function that applies the SeqSliceWorker to the iterator.
func SliceWorkerPipe(worker obiseq.SeqSliceWorker, sizes ...int) Pipeable { func SliceWorkerPipe(worker obiseq.SeqSliceWorker, breakOnError bool, sizes ...int) Pipeable {
f := func(iterator IBioSequence) IBioSequence { f := func(iterator IBioSequence) IBioSequence {
return iterator.MakeISliceWorker(worker, sizes...) return iterator.MakeISliceWorker(worker, breakOnError, sizes...)
} }
return f return f

View File

@ -172,8 +172,8 @@ func ExtractBarcodeSliceWorker(ngslibrary NGSLibrary,
ngslibrary.Compile(opt.AllowedMismatch(), opt.AllowsIndel()) ngslibrary.Compile(opt.AllowedMismatch(), opt.AllowsIndel())
worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice { worker := func(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
return _ExtractBarcodeSlice(ngslibrary, sequences, opt) return _ExtractBarcodeSlice(ngslibrary, sequences, opt), nil
} }
return worker return worker

View File

@ -28,16 +28,18 @@ func Expression(expression string) func(*BioSequence) (interface{}, error) {
func EditIdWorker(expression string) SeqWorker { func EditIdWorker(expression string) SeqWorker {
e := Expression(expression) e := Expression(expression)
f := func(sequence *BioSequence) *BioSequence { f := func(sequence *BioSequence) (BioSequenceSlice, error) {
v, err := e(sequence) v, err := e(sequence)
if err == nil {
if err != nil { sequence.SetId(fmt.Sprintf("%v", v))
log.Fatalf("Expression '%s' cannot be evaluated on sequence %s", } else {
err = fmt.Errorf("Expression '%s' cannot be evaluated on sequence %s : %v",
expression, expression,
sequence.Id()) sequence.Id(),
err)
} }
sequence.SetId(fmt.Sprintf("%v", v))
return sequence return BioSequenceSlice{sequence}, err
} }
return f return f
@ -45,16 +47,18 @@ func EditIdWorker(expression string) SeqWorker {
func EditAttributeWorker(key string, expression string) SeqWorker { func EditAttributeWorker(key string, expression string) SeqWorker {
e := Expression(expression) e := Expression(expression)
f := func(sequence *BioSequence) *BioSequence { f := func(sequence *BioSequence) (BioSequenceSlice, error) {
v, err := e(sequence) v, err := e(sequence)
if err == nil {
if err != nil { sequence.SetAttribute(key, v)
log.Fatalf("Expression '%s' cannot be evaluated on sequence %s", } else {
err = fmt.Errorf("Expression '%s' cannot be evaluated on sequence %s : %v",
expression, expression,
sequence.Id()) sequence.Id(),
err)
} }
sequence.SetAttribute(key, v)
return sequence return BioSequenceSlice{sequence}, err
} }
return f return f

View File

@ -105,8 +105,8 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence {
* @returns {SeqWorker} A function that accepts *BioSequence and returns its reversed-complement form. * @returns {SeqWorker} A function that accepts *BioSequence and returns its reversed-complement form.
*/ */
func ReverseComplementWorker(inplace bool) SeqWorker { func ReverseComplementWorker(inplace bool) SeqWorker {
f := func(input *BioSequence) *BioSequence { f := func(input *BioSequence) (BioSequenceSlice, error) {
return input.ReverseComplement(inplace) return BioSequenceSlice{input.ReverseComplement(inplace)}, nil
} }
return f return f

View File

@ -1,20 +1,25 @@
package obiseq package obiseq
import log "github.com/sirupsen/logrus" import (
"fmt"
"slices"
log "github.com/sirupsen/logrus"
)
type SeqAnnotator func(*BioSequence) type SeqAnnotator func(*BioSequence)
type SeqWorker func(*BioSequence) *BioSequence type SeqWorker func(*BioSequence) (BioSequenceSlice, error)
type SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice type SeqSliceWorker func(BioSequenceSlice) (BioSequenceSlice, error)
func NilSeqWorker(seq *BioSequence) *BioSequence { func NilSeqWorker(seq *BioSequence) (BioSequenceSlice, error) {
return seq return BioSequenceSlice{seq}, nil
} }
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker { func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
f := func(seq *BioSequence) *BioSequence { f := func(seq *BioSequence) (BioSequenceSlice, error) {
function(seq) function(seq)
return seq return BioSequenceSlice{seq}, nil
} }
return f return f
} }
@ -25,35 +30,47 @@ func SeqToSliceWorker(worker SeqWorker,
if worker == nil { if worker == nil {
if inplace { if inplace {
f = func(input BioSequenceSlice) BioSequenceSlice { f = func(input BioSequenceSlice) (BioSequenceSlice, error) {
return input return input, nil
} }
} else { } else {
f = func(input BioSequenceSlice) BioSequenceSlice { f = func(input BioSequenceSlice) (BioSequenceSlice, error) {
output := MakeBioSequenceSlice(len(input)) output := MakeBioSequenceSlice(len(input))
copy(output, input) copy(output, input)
return output return output, nil
} }
} }
} else { } else {
f = func(input BioSequenceSlice) BioSequenceSlice { f = func(input BioSequenceSlice) (BioSequenceSlice, error) {
output := input output := input
if !inplace { if !inplace {
output = MakeBioSequenceSlice(len(input)) output = MakeBioSequenceSlice(len(input))
} }
i := 0 i := 0
for _, s := range input { for _, s := range input {
r := worker(s) r, err := worker(s)
if r != nil { if err == nil {
output[i] = r for _, rs := range r {
i++ output[i] = rs
} else if breakOnError { i++
log.Fatalf("got an error on sequence %s processing", if i == cap(output) {
r.Id()) slices.Grow(output, cap(output))
}
}
} else {
if breakOnError {
err = fmt.Errorf("got an error on sequence %s processing : %v",
s.Id(), err)
return BioSequenceSlice{}, err
} else {
log.Warnf("got an error on sequence %s processing",
s.Id())
}
} }
} }
return output[0:i] return output[0:i], nil
} }
} }
@ -61,15 +78,16 @@ func SeqToSliceWorker(worker SeqWorker,
return f return f
} }
func SeqToSliceConditionalWorker(worker SeqWorker, func SeqToSliceConditionalWorker(
condition SequencePredicate, condition SequencePredicate,
worker SeqWorker,
inplace, breakOnError bool) SeqSliceWorker { inplace, breakOnError bool) SeqSliceWorker {
if condition == nil { if condition == nil {
return SeqToSliceWorker(worker, inplace, breakOnError) return SeqToSliceWorker(worker, inplace, breakOnError)
} }
f := func(input BioSequenceSlice) BioSequenceSlice { f := func(input BioSequenceSlice) (BioSequenceSlice, error) {
output := input output := input
if !inplace { if !inplace {
output = MakeBioSequenceSlice(len(input)) output = MakeBioSequenceSlice(len(input))
@ -79,18 +97,29 @@ func SeqToSliceConditionalWorker(worker SeqWorker,
for _, s := range input { for _, s := range input {
if condition(s) { if condition(s) {
r := worker(s) r, err := worker(s)
if r != nil { if err == nil {
output[i] = r for _, rs := range r {
i++ output[i] = rs
} else if breakOnError { i++
log.Fatalf("got an error on sequence %s processing", if i == cap(output) {
r.Id()) slices.Grow(output, cap(output))
}
}
} else {
if breakOnError {
err = fmt.Errorf("got an error on sequence %s processing : %v",
s.Id(), err)
return BioSequenceSlice{}, err
} else {
log.Warnf("got an error on sequence %s processing",
s.Id())
}
} }
} }
} }
return output[0:i] return output[0:i], nil
} }
return f return f
@ -105,11 +134,17 @@ func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker {
} }
} }
f := func(seq *BioSequence) *BioSequence { sw := SeqToSliceWorker(next, true, false)
f := func(seq *BioSequence) (BioSequenceSlice, error) {
if seq == nil { if seq == nil {
return nil return BioSequenceSlice{}, nil
} }
return next(worker(seq)) slice, err := worker(seq)
if err == nil {
slice, err = sw(slice)
}
return slice, err
} }
return f return f

View File

@ -147,14 +147,14 @@ func AddLCAWorker(taxonomy *Taxonomy, slot_name string, threshold float64) obise
lca_name = "scientific_name" lca_name = "scientific_name"
} }
f := func(sequence *obiseq.BioSequence) *obiseq.BioSequence { f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
lca, rans, _ := taxonomy.LCA(sequence, threshold) lca, rans, _ := taxonomy.LCA(sequence, threshold)
sequence.SetAttribute(slot_name, lca.Taxid()) sequence.SetAttribute(slot_name, lca.Taxid())
sequence.SetAttribute(lca_name, lca.ScientificName()) sequence.SetAttribute(lca_name, lca.ScientificName())
sequence.SetAttribute(lca_error, math.Round((1-rans)*1000)/1000) sequence.SetAttribute(lca_error, math.Round((1-rans)*1000)/1000)
return sequence return obiseq.BioSequenceSlice{sequence}, nil
} }
return f return f

View File

@ -14,9 +14,9 @@ func (taxonomy *Taxonomy) MakeSetTaxonAtRankWorker(rank string) obiseq.SeqWorker
taxonomy.RankList()) taxonomy.RankList())
} }
w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence { w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
taxonomy.SetTaxonAtRank(sequence, rank) taxonomy.SetTaxonAtRank(sequence, rank)
return sequence return obiseq.BioSequenceSlice{sequence}, nil
} }
return w return w
@ -24,9 +24,9 @@ func (taxonomy *Taxonomy) MakeSetTaxonAtRankWorker(rank string) obiseq.SeqWorker
func (taxonomy *Taxonomy) MakeSetSpeciesWorker() obiseq.SeqWorker { func (taxonomy *Taxonomy) MakeSetSpeciesWorker() obiseq.SeqWorker {
w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence { w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
taxonomy.SetSpecies(sequence) taxonomy.SetSpecies(sequence)
return sequence return obiseq.BioSequenceSlice{sequence}, nil
} }
return w return w
@ -34,9 +34,9 @@ func (taxonomy *Taxonomy) MakeSetSpeciesWorker() obiseq.SeqWorker {
func (taxonomy *Taxonomy) MakeSetGenusWorker() obiseq.SeqWorker { func (taxonomy *Taxonomy) MakeSetGenusWorker() obiseq.SeqWorker {
w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence { w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
taxonomy.SetGenus(sequence) taxonomy.SetGenus(sequence)
return sequence return obiseq.BioSequenceSlice{sequence}, nil
} }
return w return w
@ -44,9 +44,9 @@ func (taxonomy *Taxonomy) MakeSetGenusWorker() obiseq.SeqWorker {
func (taxonomy *Taxonomy) MakeSetFamilyWorker() obiseq.SeqWorker { func (taxonomy *Taxonomy) MakeSetFamilyWorker() obiseq.SeqWorker {
w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence { w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
taxonomy.SetFamily(sequence) taxonomy.SetFamily(sequence)
return sequence return obiseq.BioSequenceSlice{sequence}, nil
} }
return w return w
@ -54,9 +54,9 @@ func (taxonomy *Taxonomy) MakeSetFamilyWorker() obiseq.SeqWorker {
func (taxonomy *Taxonomy) MakeSetPathWorker() obiseq.SeqWorker { func (taxonomy *Taxonomy) MakeSetPathWorker() obiseq.SeqWorker {
w := func(s *obiseq.BioSequence) *obiseq.BioSequence { w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
taxonomy.SetPath(s) taxonomy.SetPath(sequence)
return s return obiseq.BioSequenceSlice{sequence}, nil
} }
return w return w

View File

@ -15,11 +15,11 @@ import (
) )
func DeleteAttributesWorker(toBeDeleted []string) obiseq.SeqWorker { func DeleteAttributesWorker(toBeDeleted []string) obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
for _, k := range toBeDeleted { for _, k := range toBeDeleted {
s.DeleteAttribute(k) s.DeleteAttribute(k)
} }
return s return obiseq.BioSequenceSlice{s}, nil
} }
return f return f
@ -48,7 +48,7 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob
slot_error := fmt.Sprintf("%s_error", name) slot_error := fmt.Sprintf("%s_error", name)
slot_location := fmt.Sprintf("%s_location", name) slot_location := fmt.Sprintf("%s_location", name)
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
apats, err := obiapat.MakeApatSequence(s, false) apats, err := obiapat.MakeApatSequence(s, false)
if err != nil { if err != nil {
log.Fatalf("error in preparing sequence %s : %v", s.Id(), err) log.Fatalf("error in preparing sequence %s : %v", s.Id(), err)
@ -59,6 +59,11 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob
if matched { if matched {
annot := s.Annotations() annot := s.Annotations()
annot[slot] = pattern annot[slot] = pattern
if start < 0 {
start = 0
}
match, err := s.Subsequence(start, end, false) match, err := s.Subsequence(start, end, false)
if err != nil { if err != nil {
log.Fatalf("Error in extracting pattern of sequence %s [%d;%d[ : %v", log.Fatalf("Error in extracting pattern of sequence %s [%d;%d[ : %v",
@ -83,7 +88,7 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob
annot[slot_location] = fmt.Sprintf("complement(%d..%d)", start+1, end) annot[slot_location] = fmt.Sprintf("complement(%d..%d)", start+1, end)
} }
} }
return s return obiseq.BioSequenceSlice{s}, nil
} }
return f return f
@ -97,14 +102,14 @@ func ToBeKeptAttributesWorker(toBeKept []string) obiseq.SeqWorker {
d[v] = true d[v] = true
} }
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
annot := s.Annotations() annot := s.Annotations()
for key := range annot { for key := range annot {
if _, ok := d[key]; !ok { if _, ok := d[key]; !ok {
s.DeleteAttribute(key) s.DeleteAttribute(key)
} }
} }
return s return obiseq.BioSequenceSlice{s}, nil
} }
return f return f
@ -112,7 +117,7 @@ func ToBeKeptAttributesWorker(toBeKept []string) obiseq.SeqWorker {
func CutSequenceWorker(from, to int, breakOnError bool) obiseq.SeqWorker { func CutSequenceWorker(from, to int, breakOnError bool) obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
var f, t int var f, t int
switch { switch {
@ -142,16 +147,15 @@ func CutSequenceWorker(from, to int, breakOnError bool) obiseq.SeqWorker {
if breakOnError { if breakOnError {
log.Fatalf("Cannot cut sequence %s (%v)", s.Id(), err) log.Fatalf("Cannot cut sequence %s (%v)", s.Id(), err)
} else { } else {
log.Warnf("Cannot cut sequence %s (%v), sequence discarded", s.Id(), err) err = fmt.Errorf("Cannot cut sequence %s (%v), sequence discarded", s.Id(), err)
return nil
} }
} }
return rep return obiseq.BioSequenceSlice{rep}, err
} }
if from == 0 && to == 0 { if from == 0 && to == 0 {
f = func(s *obiseq.BioSequence) *obiseq.BioSequence { f = func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
return s return obiseq.BioSequenceSlice{s}, nil
} }
} }
@ -163,23 +167,23 @@ func CutSequenceWorker(from, to int, breakOnError bool) obiseq.SeqWorker {
} }
func ClearAllAttributesWorker() obiseq.SeqWorker { func ClearAllAttributesWorker() obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
annot := s.Annotations() annot := s.Annotations()
for key := range annot { for key := range annot {
s.DeleteAttribute(key) s.DeleteAttribute(key)
} }
return s return obiseq.BioSequenceSlice{s}, nil
} }
return f return f
} }
func RenameAttributeWorker(toBeRenamed map[string]string) obiseq.SeqWorker { func RenameAttributeWorker(toBeRenamed map[string]string) obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
for newName, oldName := range toBeRenamed { for newName, oldName := range toBeRenamed {
s.RenameAttribute(newName, oldName) s.RenameAttribute(newName, oldName)
} }
return s return obiseq.BioSequenceSlice{s}, nil
} }
return f return f
@ -201,20 +205,20 @@ func EvalAttributeWorker(expression map[string]string) obiseq.SeqWorker {
} }
func AddTaxonAtRankWorker(taxonomy *obitax.Taxonomy, ranks ...string) obiseq.SeqWorker { func AddTaxonAtRankWorker(taxonomy *obitax.Taxonomy, ranks ...string) obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
for _, r := range ranks { for _, r := range ranks {
taxonomy.SetTaxonAtRank(s, r) taxonomy.SetTaxonAtRank(s, r)
} }
return s return obiseq.BioSequenceSlice{s}, nil
} }
return f return f
} }
func AddSeqLengthWorker() obiseq.SeqWorker { func AddSeqLengthWorker() obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) *obiseq.BioSequence { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
s.SetAttribute("seq_length", s.Len()) s.SetAttribute("seq_length", s.Len())
return s return obiseq.BioSequenceSlice{s}, nil
} }
return f return f
@ -309,8 +313,8 @@ func CLIAnnotationPipeline() obiiter.Pipeable {
predicate := obigrep.CLISequenceSelectionPredicate() predicate := obigrep.CLISequenceSelectionPredicate()
worker := CLIAnnotationWorker() worker := CLIAnnotationWorker()
annotator := obiseq.SeqToSliceConditionalWorker(worker, predicate, true, false) annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, true, false)
f := obiiter.SliceWorkerPipe(annotator, obioptions.CLIParallelWorkers()) f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers())
return f return f
} }

View File

@ -60,7 +60,7 @@ func annotateOBIClean(dataset obiseq.BioSequenceSlice,
sample map[string]*([]*seqPCR), sample map[string]*([]*seqPCR),
tag, NAValue string) obiiter.IBioSequence { tag, NAValue string) obiiter.IBioSequence {
batchsize := 1000 batchsize := 1000
var annot = func(data obiseq.BioSequenceSlice) obiseq.BioSequenceSlice { var annot = func(data obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
for _, s := range data { for _, s := range data {
status := Status(s) status := Status(s)
@ -87,11 +87,11 @@ func annotateOBIClean(dataset obiseq.BioSequenceSlice,
annotation["obiclean_samplecount"] = head + internal + singleton annotation["obiclean_samplecount"] = head + internal + singleton
} }
return data return data, nil
} }
iter := obiiter.IBatchOver(dataset, batchsize) iter := obiiter.IBatchOver(dataset, batchsize)
riter := iter.MakeISliceWorker(annot) riter := iter.MakeISliceWorker(annot, false)
return riter return riter
} }

View File

@ -50,10 +50,13 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
obioptions.CLIParallelWorkers()) obioptions.CLIParallelWorkers())
annotated := usable.MakeIWorker(taxonomy.MakeSetSpeciesWorker(), annotated := usable.MakeIWorker(taxonomy.MakeSetSpeciesWorker(),
false,
obioptions.CLIParallelWorkers(), obioptions.CLIParallelWorkers(),
).MakeIWorker(taxonomy.MakeSetGenusWorker(), ).MakeIWorker(taxonomy.MakeSetGenusWorker(),
false,
obioptions.CLIParallelWorkers(), obioptions.CLIParallelWorkers(),
).MakeIWorker(taxonomy.MakeSetFamilyWorker(), ).MakeIWorker(taxonomy.MakeSetFamilyWorker(),
false,
obioptions.CLIParallelWorkers(), obioptions.CLIParallelWorkers(),
) )

View File

@ -30,7 +30,7 @@ func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error
worker := obingslibrary.ExtractBarcodeSliceWorker(ngsfilter, opts...) worker := obingslibrary.ExtractBarcodeSliceWorker(ngsfilter, opts...)
newIter := iterator.MakeISliceWorker(worker) newIter := iterator.MakeISliceWorker(worker, false)
if !CLIConservedErrors() { if !CLIConservedErrors() {
log.Println("Discards unassigned sequences") log.Println("Discards unassigned sequences")

View File

@ -60,5 +60,5 @@ func CLIPCR(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) {
iterator = iterator.Pipe(frags) iterator = iterator.Pipe(frags)
} }
return iterator.MakeISliceWorker(worker, obioptions.CLIParallelWorkers(), 0), nil return iterator.MakeISliceWorker(worker, false, obioptions.CLIParallelWorkers(), 0), nil
} }

View File

@ -1,9 +1,10 @@
package obitag package obitag
import ( import (
log "github.com/sirupsen/logrus"
"math" "math"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
@ -190,9 +191,10 @@ func GeomIdentifySeqWorker(references *obiseq.BioSequenceSlice,
landmarks := ExtractLandmarkSeqs(references) landmarks := ExtractLandmarkSeqs(references)
taxa := ExtractTaxonSet(references, taxo) taxa := ExtractTaxonSet(references, taxo)
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence { return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
buffer := make([]uint64, 100) buffer := make([]uint64, 100)
return GeomIdentify(sequence, landmarks, references, taxa, taxo, &buffer) return obiseq.BioSequenceSlice{GeomIdentify(sequence, landmarks, references, taxa, taxo, &buffer)},
nil
} }
} }
@ -202,5 +204,5 @@ func CLIGeomAssignTaxonomy(iterator obiiter.IBioSequence,
) obiiter.IBioSequence { ) obiiter.IBioSequence {
worker := GeomIdentifySeqWorker(&references, taxo) worker := GeomIdentifySeqWorker(&references, taxo)
return iterator.MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0) return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0)
} }

View File

@ -259,8 +259,8 @@ func IdentifySeqWorker(references obiseq.BioSequenceSlice,
taxa obitax.TaxonSet, taxa obitax.TaxonSet,
taxo *obitax.Taxonomy, taxo *obitax.Taxonomy,
runExact bool) obiseq.SeqWorker { runExact bool) obiseq.SeqWorker {
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence { return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
return Identify(sequence, references, refcounts, taxa, taxo, runExact) return obiseq.BioSequenceSlice{Identify(sequence, references, refcounts, taxa, taxo, runExact)}, nil
} }
} }
@ -285,5 +285,5 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact()) worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
return iterator.MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0) return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0)
} }