From 08521c74e2adc8e622da41b2e6735e032aebec51 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 9 Feb 2023 15:59:11 +0100 Subject: [PATCH] Adds Aho-Corasick matching to obiannotate --- cmd/obitools/obiannotate/main.go | 2 +- go.mod | 1 + go.sum | 2 ++ pkg/obicorazick/worker.go | 45 +++++++++++++++++++++++++ pkg/obitools/obiannotate/obiannotate.go | 11 ++++++ pkg/obitools/obiannotate/options.go | 35 ++++++++++++++++++- 6 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 pkg/obicorazick/worker.go diff --git a/cmd/obitools/obiannotate/main.go b/cmd/obitools/obiannotate/main.go index f652972..4d8bdb7 100644 --- a/cmd/obitools/obiannotate/main.go +++ b/cmd/obitools/obiannotate/main.go @@ -36,7 +36,7 @@ func main() { sequences, _ := obiconvert.ReadBioSequences(args...) annotator := obiannotate.CLIAnnotationPipeline() - obiconvert.WriteBioSequences(sequences.Pipe(annotator), true) + obiconvert.WriteBioSequences(sequences.Pipe(annotator).Speed(), true) obiiter.WaitForLastPipe() diff --git a/go.mod b/go.mod index 2f839dd..666587c 100644 --- a/go.mod +++ b/go.mod @@ -23,6 +23,7 @@ require ( github.com/mattn/go-runewidth v0.0.13 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/rivo/uniseg v0.2.0 // indirect + github.com/rrethy/ahocorasick v1.0.0 // indirect github.com/vc60er/deptree v0.0.0-20220713110736-c48b0387dddc // indirect github.com/yuin/goldmark v1.4.13 // indirect golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 // indirect diff --git a/go.sum b/go.sum index 163fac6..b1ed170 100644 --- a/go.sum +++ b/go.sum @@ -38,6 +38,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rrethy/ahocorasick 
v1.0.0 h1:YKkCB+E5PXc0xmLfMrWbfNht8vG9Re97IHSWZk/Lk8E= +github.com/rrethy/ahocorasick v1.0.0/go.mod h1:nq8oScE7Vy1rOppoQxpQiiDmPHuKCuk9rXrNcxUV3R0= github.com/schollz/progressbar/v3 v3.8.6 h1:QruMUdzZ1TbEP++S1m73OqRJk20ON11m6Wqv4EoGg8c= github.com/schollz/progressbar/v3 v3.8.6/go.mod h1:W5IEwbJecncFGBvuEh4A7HT1nZZ6WNIL2i3qbnI0WKY= github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE= diff --git a/pkg/obicorazick/worker.go b/pkg/obicorazick/worker.go new file mode 100644 index 0000000..1a27d6e --- /dev/null +++ b/pkg/obicorazick/worker.go @@ -0,0 +1,45 @@ +package obicorazick + +import ( + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "github.com/rrethy/ahocorasick" +) + +func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker { + + matcher := ahocorasick.CompileStrings(patterns) + + fslot := slot + "_Fwd" + rslot := slot + "_Rev" + + f := func(s *obiseq.BioSequence) *obiseq.BioSequence { + matchesF := len(matcher.FindAllByteSlice(s.Sequence())) + matchesR := len(matcher.FindAllByteSlice(s.ReverseComplement(false).Sequence())) + + log.Debugln("Macthes = ",matchesF,matchesR) + matches := matchesF + matchesR + if matches > 0 { + s.SetAttribute(slot, matches) + s.SetAttribute(fslot, matchesF) + s.SetAttribute(rslot, matchesR) + } + + return s + } + + return f +} + +func AhoCorazickPredicate(minMatches int, patterns []string) obiseq.SequencePredicate { + + matcher := ahocorasick.CompileStrings(patterns) + + f := func(s *obiseq.BioSequence) bool { + matches := matcher.FindAllByteSlice(s.Sequence()) + return len(matches) >= minMatches + } + + return f +} diff --git a/pkg/obitools/obiannotate/obiannotate.go b/pkg/obitools/obiannotate/obiannotate.go index 41c5b10..de6b2d5 100644 --- a/pkg/obitools/obiannotate/obiannotate.go +++ b/pkg/obitools/obiannotate/obiannotate.go @@ -1,6 +1,9 @@ package obiannotate import ( + "log" + + 
// _ahoCorazick stores the value of the "aho-corasick" command line option:
// the path of a text file listing the patterns to match, one per line.
// The empty string means the option was not provided.
var _ahoCorazick = ""

// CLIHasAhoCorasick reports whether the "aho-corasick" option designates a
// file that os.Stat can access.
//
// It returns false when the option is unset. When a path was given but cannot
// be accessed, it also returns false, and a message is logged so that a
// mistyped file name does not silently disable the annotation.
func CLIHasAhoCorasick() bool {
	if _ahoCorazick == "" {
		return false
	}

	if _, err := os.Stat(_ahoCorazick); err != nil {
		// The os.Stat error already carries the offending path.
		log.Printf("Ignoring the aho-corasick option: %v", err)
		return false
	}

	return true
}

// CLIAhoCorazick reads the pattern file designated by the "aho-corasick"
// option and returns its patterns, lower-cased, one per non-blank line.
//
// Surrounding whitespace (including the "\r" left by CRLF line endings) is
// removed from every line, and lines that are empty after trimming are
// skipped. The process is terminated through log.Fatalf when the file cannot
// be read.
func CLIAhoCorazick() []string {
	content, err := ioutil.ReadFile(_ahoCorazick)
	if err != nil {
		log.Fatalf("Cannot open file %s: %v", _ahoCorazick, err)
	}

	return parseAhoCorasickPatterns(string(content))
}

// parseAhoCorasickPatterns splits content into trimmed, lower-cased, non-empty lines.
func parseAhoCorasickPatterns(content string) []string {
	lines := strings.Split(content, "\n")

	// Filter in place: patterns shares the backing array of lines and never
	// gets ahead of the read position.
	patterns := lines[:0]
	for _, line := range lines {
		if p := strings.ToLower(strings.TrimSpace(line)); p != "" {
			patterns = append(patterns, p)
		}
	}

	return patterns
}