mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds Aho-Corasick matching to obiannotate
This commit is contained in:
@ -36,7 +36,7 @@ func main() {
|
||||
|
||||
sequences, _ := obiconvert.ReadBioSequences(args...)
|
||||
annotator := obiannotate.CLIAnnotationPipeline()
|
||||
obiconvert.WriteBioSequences(sequences.Pipe(annotator), true)
|
||||
obiconvert.WriteBioSequences(sequences.Pipe(annotator).Speed(), true)
|
||||
|
||||
obiiter.WaitForLastPipe()
|
||||
|
||||
|
1
go.mod
1
go.mod
@ -23,6 +23,7 @@ require (
|
||||
github.com/mattn/go-runewidth v0.0.13 // indirect
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||
github.com/rivo/uniseg v0.2.0 // indirect
|
||||
github.com/rrethy/ahocorasick v1.0.0 // indirect
|
||||
github.com/vc60er/deptree v0.0.0-20220713110736-c48b0387dddc // indirect
|
||||
github.com/yuin/goldmark v1.4.13 // indirect
|
||||
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 // indirect
|
||||
|
2
go.sum
2
go.sum
@ -38,6 +38,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rrethy/ahocorasick v1.0.0 h1:YKkCB+E5PXc0xmLfMrWbfNht8vG9Re97IHSWZk/Lk8E=
|
||||
github.com/rrethy/ahocorasick v1.0.0/go.mod h1:nq8oScE7Vy1rOppoQxpQiiDmPHuKCuk9rXrNcxUV3R0=
|
||||
github.com/schollz/progressbar/v3 v3.8.6 h1:QruMUdzZ1TbEP++S1m73OqRJk20ON11m6Wqv4EoGg8c=
|
||||
github.com/schollz/progressbar/v3 v3.8.6/go.mod h1:W5IEwbJecncFGBvuEh4A7HT1nZZ6WNIL2i3qbnI0WKY=
|
||||
github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
|
||||
|
45
pkg/obicorazick/worker.go
Normal file
45
pkg/obicorazick/worker.go
Normal file
@ -0,0 +1,45 @@
|
||||
package obicorazick
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"github.com/rrethy/ahocorasick"
|
||||
)
|
||||
|
||||
func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
|
||||
|
||||
matcher := ahocorasick.CompileStrings(patterns)
|
||||
|
||||
fslot := slot + "_Fwd"
|
||||
rslot := slot + "_Rev"
|
||||
|
||||
f := func(s *obiseq.BioSequence) *obiseq.BioSequence {
|
||||
matchesF := len(matcher.FindAllByteSlice(s.Sequence()))
|
||||
matchesR := len(matcher.FindAllByteSlice(s.ReverseComplement(false).Sequence()))
|
||||
|
||||
log.Debugln("Macthes = ",matchesF,matchesR)
|
||||
matches := matchesF + matchesR
|
||||
if matches > 0 {
|
||||
s.SetAttribute(slot, matches)
|
||||
s.SetAttribute(fslot, matchesF)
|
||||
s.SetAttribute(rslot, matchesR)
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func AhoCorazickPredicate(minMatches int, patterns []string) obiseq.SequencePredicate {
|
||||
|
||||
matcher := ahocorasick.CompileStrings(patterns)
|
||||
|
||||
f := func(s *obiseq.BioSequence) bool {
|
||||
matches := matcher.FindAllByteSlice(s.Sequence())
|
||||
return len(matches) >= minMatches
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
@ -1,6 +1,9 @@
|
||||
package obiannotate
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obicorazick"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||
@ -118,6 +121,14 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
|
||||
annotator = annotator.ChainWorkers(w)
|
||||
}
|
||||
|
||||
if CLIHasAhoCorasick() {
|
||||
patterns := CLIAhoCorazick()
|
||||
log.Println("Matching : ", len(patterns), " patterns on sequences")
|
||||
w := obicorazick.AhoCorazickWorker("aho_corasick", patterns)
|
||||
log.Println("Automata built")
|
||||
annotator = annotator.ChainWorkers(w)
|
||||
}
|
||||
|
||||
return annotator
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,11 @@
|
||||
package obiannotate
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
@ -15,6 +20,7 @@ var _tagList = ""
|
||||
var _clearAll = false
|
||||
var _setSeqLength = false
|
||||
var _uniqueID = false
|
||||
var _ahoCorazick = ""
|
||||
|
||||
func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
||||
// options.BoolVar(&_addRank, "seq-rank", _addRank,
|
||||
@ -29,6 +35,8 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
||||
options.Description("Adds attribute with seq_length as a key and sequence length as a value."),
|
||||
)
|
||||
|
||||
options.StringVar(&_ahoCorazick, "aho-corasick", _ahoCorazick,
|
||||
options.Description("Adds an aho-corasick attribut with the count of matches of the provided patterns."))
|
||||
// options.BoolVar(&_uniqueID, "uniq-id", _uniqueID,
|
||||
// options.Description("Forces sequence record ids to be unique."),
|
||||
// )
|
||||
@ -130,4 +138,29 @@ func CLIHasSetLengthFlag() bool {
|
||||
|
||||
// CLIHasClearAllFlag returns the current value of the package-level
// _clearAll flag.
func CLIHasClearAllFlag() bool {
	return _clearAll
}
|
||||
}
|
||||
|
||||
func CLIHasAhoCorasick() bool {
|
||||
_, err := os.Stat(_ahoCorazick)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func CLIAhoCorazick() []string {
|
||||
content, err := ioutil.ReadFile(_ahoCorazick)
|
||||
if err != nil {
|
||||
log.Fatalln("Cannot open file ", _ahoCorazick)
|
||||
}
|
||||
lines := strings.Split(string(content), "\n")
|
||||
|
||||
j := 0
|
||||
for _, s := range lines {
|
||||
if len(s) > 0 {
|
||||
lines[j] = strings.ToLower(s)
|
||||
j++
|
||||
}
|
||||
}
|
||||
|
||||
lines = lines[0:j]
|
||||
|
||||
return lines
|
||||
}
|
||||
|
Reference in New Issue
Block a user