From a57cfda675690a8cf8701944d0542f79357da1ae Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Thu, 10 Apr 2025 15:17:15 +0200
Subject: [PATCH] Make the replace function of the eval language accepting regex

---
Reviewed revision of the original commit: some added (+) lines were amended,
so the blob ids on the "index" lines no longer describe the post-image.

 pkg/obicorazick/worker.go      | 71 ++++++++++++++++++++++++++++++++--
 pkg/obioptions/version.go      |  2 +-
 pkg/obiseq/language.go         | 10 ++++-
 pkg/obiutils/cast_interface.go | 17 ++++++++
 4 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/pkg/obicorazick/worker.go b/pkg/obicorazick/worker.go
index e454f0e..a18791f 100644
--- a/pkg/obicorazick/worker.go
+++ b/pkg/obicorazick/worker.go
@@ -1,22 +1,87 @@
 package obicorazick
 
 import (
+	"os"
+	"sync"
+
 	log "github.com/sirupsen/logrus"
 
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
 	"github.com/rrethy/ahocorasick"
+	"github.com/schollz/progressbar/v3"
 )
 
+// AhoCorazickWorker builds a SeqWorker that counts, for each sequence, the
+// occurrences of patterns on the sequence and on its reverse complement.
+// The patterns are compiled into Aho-Corasick matchers by batches, the
+// batches being compiled in parallel.
 func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
 
-	matcher := ahocorasick.CompileStrings(patterns)
+	// Patterns are split into batches of at most batchSize so that the
+	// matchers can be compiled concurrently.
+	const batchSize = 10000000
+
+	if len(patterns) == 0 {
+		log.Errorln("No patterns provided")
+	}
+
+	// Ceiling division, with a floor of one so that an empty pattern list
+	// still yields one matcher.
+	nmatcher := max(1, (len(patterns)+batchSize-1)/batchSize)
+	log.Infof("Building AhoCorasick %d matcher for %d patterns in slot %s",
+		nmatcher, len(patterns), slot)
+
+	// At least one builder is required, otherwise the feeding loop below
+	// would block forever on the unbuffered channel.
+	npar := max(1, min(obidefault.ParallelWorkers(), nmatcher))
+
+	bar := progressbar.NewOptions(nmatcher,
+		progressbar.OptionSetWriter(os.Stderr),
+		progressbar.OptionSetWidth(15),
+		progressbar.OptionShowCount(),
+		progressbar.OptionShowIts(),
+		progressbar.OptionSetDescription("Building AhoCorasick matcher..."),
+	)
+	bar.Add(0)
+
+	matchers := make([]*ahocorasick.Matcher, nmatcher)
+	batches := make(chan int)
+
+	var wg sync.WaitGroup
+	wg.Add(npar)
+	for w := 0; w < npar; w++ {
+		go func() {
+			defer wg.Done()
+			// Each batch index is delivered to exactly one goroutine, so
+			// every slot of matchers has a single writer.
+			for i := range batches {
+				last := min((i+1)*batchSize, len(patterns))
+				matchers[i] = ahocorasick.CompileStrings(patterns[i*batchSize : last])
+				bar.Add(1)
+			}
+		}()
+	}
+
+	for i := 0; i < nmatcher; i++ {
+		batches <- i
+	}
+	close(batches)
+	wg.Wait()
 
 	fslot := slot + "_Fwd"
 	rslot := slot + "_Rev"
 
 	f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
-		matchesF := len(matcher.FindAllByteSlice(s.Sequence()))
-		matchesR := len(matcher.FindAllByteSlice(s.ReverseComplement(false).Sequence()))
+		fwd := s.Sequence()
+		rev := s.ReverseComplement(false).Sequence()
+
+		// Sum the hits of every batch matcher on both strands.
+		matchesF, matchesR := 0, 0
+		for _, matcher := range matchers {
+			matchesF += len(matcher.FindAllByteSlice(fwd))
+			matchesR += len(matcher.FindAllByteSlice(rev))
+		}
 
 		log.Debugln("Macthes = ", matchesF, matchesR)
 		matches := matchesF + matchesR
diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go
index 40d5f6f..4415d4f 100644
--- a/pkg/obioptions/version.go
+++ b/pkg/obioptions/version.go
@@ -8,7 +8,7 @@ import (
 // corresponds to the last commit, and not the one when the file will be
 // commited
 
-var _Commit = "67e5b6e"
+var _Commit = "0aec5ba"
 var _Version = "Release 4.4.0"
 
 // Version returns the version of the obitools package.
diff --git a/pkg/obiseq/language.go b/pkg/obiseq/language.go
index a79c66a..9292092 100644
--- a/pkg/obiseq/language.go
+++ b/pkg/obiseq/language.go
@@ -3,6 +3,7 @@ package obiseq
 import (
 	"fmt"
 	"reflect"
+	"regexp"
 	"strings"
 
 	log "github.com/sirupsen/logrus"
@@ -204,6 +205,13 @@ var OBILang = gval.NewLanguage(
 		return scomp, nil
 	}),
 	gval.Function("replace", func(args ...interface{}) (interface{}, error) {
-		return strings.ReplaceAll(args[0].(string), args[1].(string), args[2].(string)), nil
+		// The second argument is a regular expression (RE2 syntax). It is
+		// supplied by the evaluated expression, so a malformed pattern must
+		// surface as an error instead of panicking in regexp.MustCompile.
+		pattern, err := regexp.Compile(args[1].(string))
+		if err != nil {
+			return nil, fmt.Errorf("replace: invalid pattern %q: %w", args[1], err)
+		}
+		return pattern.ReplaceAllString(args[0].(string), args[2].(string)), nil
 	}),
 )
diff --git a/pkg/obiutils/cast_interface.go b/pkg/obiutils/cast_interface.go
index 690c8dc..f56c55a 100644
--- a/pkg/obiutils/cast_interface.go
+++ b/pkg/obiutils/cast_interface.go
@@ -3,6 +3,7 @@ package obiutils
 import (
 	"fmt"
 	"reflect"
+	"strconv"
 
 	log "github.com/sirupsen/logrus"
 )
@@ -125,6 +126,14 @@ func InterfaceToInt(i interface{}) (val int, err error) {
 		val = int(t) // standardizes across systems
 	case uint64:
 		val = int(t) // standardizes across systems
+	case string:
+		// Assign to the named results: a ":=" here would declare a new err
+		// local to the case clause and the parse failure would be lost.
+		val, err = strconv.Atoi(t)
+		if err != nil {
+			val = 0
+			err = &NotAnInteger{"value attribute cannot be casted to an int value"}
+		}
 	default:
 		err = &NotAnInteger{"value attribute cannot be casted to an integer"}
 	}
@@ -162,6 +171,14 @@ func InterfaceToFloat64(i interface{}) (val float64, err error) {
 		val = float64(t) // standardizes across systems
 	case uint64:
 		val = float64(t) // standardizes across systems
+	case string:
+		// bitSize is 64 to match the float64 result (only 32 and 64 are
+		// meaningful values for strconv.ParseFloat).
+		val, err = strconv.ParseFloat(t, 64)
+		if err != nil {
+			val = 0
+			err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
+		}
 	default:
 		err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
 	}