mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds Aho-Corasick matching to obiannotate
This commit is contained in:
45
pkg/obicorazick/worker.go
Normal file
45
pkg/obicorazick/worker.go
Normal file
@ -0,0 +1,45 @@
|
||||
package obicorazick
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"github.com/rrethy/ahocorasick"
|
||||
)
|
||||
|
||||
func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
|
||||
|
||||
matcher := ahocorasick.CompileStrings(patterns)
|
||||
|
||||
fslot := slot + "_Fwd"
|
||||
rslot := slot + "_Rev"
|
||||
|
||||
f := func(s *obiseq.BioSequence) *obiseq.BioSequence {
|
||||
matchesF := len(matcher.FindAllByteSlice(s.Sequence()))
|
||||
matchesR := len(matcher.FindAllByteSlice(s.ReverseComplement(false).Sequence()))
|
||||
|
||||
log.Debugln("Macthes = ",matchesF,matchesR)
|
||||
matches := matchesF + matchesR
|
||||
if matches > 0 {
|
||||
s.SetAttribute(slot, matches)
|
||||
s.SetAttribute(fslot, matchesF)
|
||||
s.SetAttribute(rslot, matchesR)
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func AhoCorazickPredicate(minMatches int, patterns []string) obiseq.SequencePredicate {
|
||||
|
||||
matcher := ahocorasick.CompileStrings(patterns)
|
||||
|
||||
f := func(s *obiseq.BioSequence) bool {
|
||||
matches := matcher.FindAllByteSlice(s.Sequence())
|
||||
return len(matches) >= minMatches
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
@ -1,6 +1,9 @@
|
||||
package obiannotate
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obicorazick"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||
@ -118,6 +121,14 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
|
||||
annotator = annotator.ChainWorkers(w)
|
||||
}
|
||||
|
||||
if CLIHasAhoCorasick() {
|
||||
patterns := CLIAhoCorazick()
|
||||
log.Println("Matching : ", len(patterns), " patterns on sequences")
|
||||
w := obicorazick.AhoCorazickWorker("aho_corasick", patterns)
|
||||
log.Println("Automata built")
|
||||
annotator = annotator.ChainWorkers(w)
|
||||
}
|
||||
|
||||
return annotator
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,11 @@
|
||||
package obiannotate
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
@ -15,6 +20,7 @@ var _tagList = ""
|
||||
var _clearAll = false
|
||||
var _setSeqLength = false
|
||||
var _uniqueID = false
|
||||
var _ahoCorazick = ""
|
||||
|
||||
func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
||||
// options.BoolVar(&_addRank, "seq-rank", _addRank,
|
||||
@ -29,6 +35,8 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
||||
options.Description("Adds attribute with seq_length as a key and sequence length as a value."),
|
||||
)
|
||||
|
||||
options.StringVar(&_ahoCorazick, "aho-corasick", _ahoCorazick,
|
||||
options.Description("Adds an aho-corasick attribut with the count of matches of the provided patterns."))
|
||||
// options.BoolVar(&_uniqueID, "uniq-id", _uniqueID,
|
||||
// options.Description("Forces sequence record ids to be unique."),
|
||||
// )
|
||||
@ -130,4 +138,29 @@ func CLIHasSetLengthFlag() bool {
|
||||
|
||||
func CLIHasClearAllFlag() bool {
|
||||
return _clearAll
|
||||
}
|
||||
}
|
||||
|
||||
func CLIHasAhoCorasick() bool {
|
||||
_, err := os.Stat(_ahoCorazick)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func CLIAhoCorazick() []string {
|
||||
content, err := ioutil.ReadFile(_ahoCorazick)
|
||||
if err != nil {
|
||||
log.Fatalln("Cannot open file ", _ahoCorazick)
|
||||
}
|
||||
lines := strings.Split(string(content), "\n")
|
||||
|
||||
j := 0
|
||||
for _, s := range lines {
|
||||
if len(s) > 0 {
|
||||
lines[j] = strings.ToLower(s)
|
||||
j++
|
||||
}
|
||||
}
|
||||
|
||||
lines = lines[0:j]
|
||||
|
||||
return lines
|
||||
}
|
||||
|
Reference in New Issue
Block a user