Adds Aho-Corasick matching to obiannotate

This commit is contained in:
2023-02-09 15:59:11 +01:00
parent 8b70b1a5d8
commit 08521c74e2
6 changed files with 94 additions and 2 deletions

View File

@ -36,7 +36,7 @@ func main() {
sequences, _ := obiconvert.ReadBioSequences(args...)
annotator := obiannotate.CLIAnnotationPipeline()
obiconvert.WriteBioSequences(sequences.Pipe(annotator), true)
obiconvert.WriteBioSequences(sequences.Pipe(annotator).Speed(), true)
obiiter.WaitForLastPipe()

1
go.mod
View File

@ -23,6 +23,7 @@ require (
github.com/mattn/go-runewidth v0.0.13 // indirect
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/rrethy/ahocorasick v1.0.0 // indirect
github.com/vc60er/deptree v0.0.0-20220713110736-c48b0387dddc // indirect
github.com/yuin/goldmark v1.4.13 // indirect
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 // indirect

2
go.sum
View File

@ -38,6 +38,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rrethy/ahocorasick v1.0.0 h1:YKkCB+E5PXc0xmLfMrWbfNht8vG9Re97IHSWZk/Lk8E=
github.com/rrethy/ahocorasick v1.0.0/go.mod h1:nq8oScE7Vy1rOppoQxpQiiDmPHuKCuk9rXrNcxUV3R0=
github.com/schollz/progressbar/v3 v3.8.6 h1:QruMUdzZ1TbEP++S1m73OqRJk20ON11m6Wqv4EoGg8c=
github.com/schollz/progressbar/v3 v3.8.6/go.mod h1:W5IEwbJecncFGBvuEh4A7HT1nZZ6WNIL2i3qbnI0WKY=
github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=

45
pkg/obicorazick/worker.go Normal file
View File

@ -0,0 +1,45 @@
package obicorazick
import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"github.com/rrethy/ahocorasick"
)
// AhoCorazickWorker builds a SeqWorker that counts the occurrences of a set
// of patterns on a sequence and on its reverse complement.
//
// The Aho-Corasick automaton is compiled once from patterns when the worker
// is created; the returned function captures it and reuses it for every
// sequence it processes.
//
// For each sequence, the worker counts the matches found on the sequence as
// returned by Sequence() and on its reverse complement. When the total is
// greater than zero, three attributes are set on the sequence:
//
//   - slot:          total number of matches (forward + reverse)
//   - slot + "_Fwd": number of matches on the sequence itself
//   - slot + "_Rev": number of matches on its reverse complement
//
// When no match is found, no attribute is set. In both cases the input
// sequence pointer is returned.
func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
	matcher := ahocorasick.CompileStrings(patterns)

	fslot := slot + "_Fwd"
	rslot := slot + "_Rev"

	f := func(s *obiseq.BioSequence) *obiseq.BioSequence {
		matchesF := len(matcher.FindAllByteSlice(s.Sequence()))
		// NOTE(review): the false argument presumably asks for a new
		// sequence instead of an in-place reversal, which is required
		// since s is annotated and returned afterwards - confirm.
		matchesR := len(matcher.FindAllByteSlice(s.ReverseComplement(false).Sequence()))

		log.Debugln("Matches = ", matchesF, matchesR)

		matches := matchesF + matchesR
		if matches > 0 {
			s.SetAttribute(slot, matches)
			s.SetAttribute(fslot, matchesF)
			s.SetAttribute(rslot, matchesR)
		}

		return s
	}

	return f
}
// AhoCorazickPredicate builds a SequencePredicate that accepts a sequence
// when at least minMatches occurrences of the patterns are found in it.
//
// The Aho-Corasick automaton is compiled once from patterns and captured by
// the returned predicate. Only the sequence as returned by Sequence() is
// scanned; its reverse complement is not searched.
func AhoCorazickPredicate(minMatches int, patterns []string) obiseq.SequencePredicate {
	automaton := ahocorasick.CompileStrings(patterns)

	return func(seq *obiseq.BioSequence) bool {
		hits := len(automaton.FindAllByteSlice(seq.Sequence()))
		return hits >= minMatches
	}
}

View File

@ -1,6 +1,9 @@
package obiannotate
import (
"log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obicorazick"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
@ -118,6 +121,14 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
annotator = annotator.ChainWorkers(w)
}
if CLIHasAhoCorasick() {
patterns := CLIAhoCorazick()
log.Println("Matching : ", len(patterns), " patterns on sequences")
w := obicorazick.AhoCorazickWorker("aho_corasick", patterns)
log.Println("Automata built")
annotator = annotator.ChainWorkers(w)
}
return annotator
}

View File

@ -1,6 +1,11 @@
package obiannotate
import (
"io/ioutil"
"log"
"os"
"strings"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
"github.com/DavidGamba/go-getoptions"
@ -15,6 +20,7 @@ var _tagList = ""
var _clearAll = false
var _setSeqLength = false
var _uniqueID = false
var _ahoCorazick = ""
func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
// options.BoolVar(&_addRank, "seq-rank", _addRank,
@ -29,6 +35,8 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
options.Description("Adds attribute with seq_length as a key and sequence length as a value."),
)
options.StringVar(&_ahoCorazick, "aho-corasick", _ahoCorazick,
options.Description("Adds an aho-corasick attribut with the count of matches of the provided patterns."))
// options.BoolVar(&_uniqueID, "uniq-id", _uniqueID,
// options.Description("Forces sequence record ids to be unique."),
// )
@ -130,4 +138,29 @@ func CLIHasSetLengthFlag() bool {
// CLIHasClearAllFlag returns the current value of the package-level
// _clearAll option flag.
func CLIHasClearAllFlag() bool {
return _clearAll
}
}
// CLIHasAhoCorasick reports whether an Aho-Corasick pattern file is usable.
//
// It returns true only when os.Stat succeeds on the path stored in the
// package-level _ahoCorazick variable. An empty path yields false silently.
// When a non-empty path cannot be stat'ed, the function still returns false
// but logs the reason, so that a mistyped file name does not disable the
// annotation without any feedback.
func CLIHasAhoCorasick() bool {
	if _ahoCorazick == "" {
		return false
	}

	if _, err := os.Stat(_ahoCorazick); err != nil {
		log.Printf("Aho-Corasick pattern file %q ignored: %v", _ahoCorazick, err)
		return false
	}

	return true
}
// CLIAhoCorazick loads the list of patterns from the file whose path is
// stored in the package-level _ahoCorazick variable.
//
// The file is expected to hold one pattern per line. Each line is stripped
// of its leading and trailing white space (which also removes the "\r" left
// by Windows line endings), converted to lower case, and kept only if it is
// not empty. The patterns are returned in file order.
//
// If the file cannot be read, the program is terminated through
// log.Fatalln.
func CLIAhoCorazick() []string {
	content, err := ioutil.ReadFile(_ahoCorazick)
	if err != nil {
		log.Fatalln("Cannot open file ", _ahoCorazick, ":", err)
	}

	lines := strings.Split(string(content), "\n")

	// Compact the kept patterns at the front of lines, reusing its
	// backing array; j is the number of patterns kept so far.
	j := 0
	for _, s := range lines {
		s = strings.TrimSpace(s)
		if len(s) > 0 {
			lines[j] = strings.ToLower(s)
			j++
		}
	}

	return lines[:j]
}