mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-10 17:50:26 +00:00
Add option related to agrep match on obigrep and obiannotate
This commit is contained in:
@@ -25,7 +25,7 @@ func DeleteAttributesWorker(toBeDeleted []string) obiseq.SeqWorker {
|
||||
return f
|
||||
}
|
||||
|
||||
func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) obiseq.SeqWorker {
|
||||
func MatchPatternWorker(pattern, name string, errormax int, bothStrand, allowsIndel bool) obiseq.SeqWorker {
|
||||
pat, err := obiapat.MakeApatPattern(pattern, errormax, allowsIndel)
|
||||
if err != nil {
|
||||
log.Fatalf("error in compiling pattern (%s) : %v", pattern, err)
|
||||
@@ -56,7 +56,7 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob
|
||||
|
||||
start, end, nerr, matched := pat.BestMatch(apats, 0, s.Len())
|
||||
|
||||
if matched {
|
||||
if matched && start >= 0 && end <= s.Len() {
|
||||
annot := s.Annotations()
|
||||
annot[slot] = pattern
|
||||
|
||||
@@ -75,7 +75,7 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob
|
||||
} else {
|
||||
start, end, nerr, matched := cpat.BestMatch(apats, 0, s.Len())
|
||||
|
||||
if matched {
|
||||
if matched && start >= 0 && end <= s.Len() {
|
||||
annot := s.Annotations()
|
||||
annot[slot] = pattern
|
||||
match, err := s.Subsequence(start, end, false)
|
||||
@@ -328,9 +328,10 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
|
||||
}
|
||||
|
||||
if CLIHasPattern() {
|
||||
log.Infof("Match pattern %s with %d error", CLIPattern(), CLIPatternError())
|
||||
log.Infof("Match pattern %s with %d error", CLIPattern(), obigrep.CLIPatternError())
|
||||
w := MatchPatternWorker(CLIPattern(), CLIHasPatternName(),
|
||||
CLIPatternError(), CLIPatternInDels())
|
||||
obigrep.CLIPatternError(), obigrep.CLIPatternBothStrand(),
|
||||
obigrep.CLIPatternInDels())
|
||||
|
||||
annotator = annotator.ChainWorkers(w)
|
||||
}
|
||||
|
||||
@@ -24,8 +24,6 @@ var _setSeqLength = false
|
||||
var _uniqueID = false
|
||||
var _ahoCorazick = ""
|
||||
var _pattern = ""
|
||||
var _pattern_error = 0
|
||||
var _pattern_indel = false
|
||||
var _pattern_name = "pattern"
|
||||
var _lcaSlot = ""
|
||||
var _lcaError = 0.0
|
||||
@@ -62,14 +60,6 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
||||
options.Description("specify the name to use as prefix for the slots reporting the match"),
|
||||
)
|
||||
|
||||
options.IntVar(&_pattern_error, "pattern-error", _pattern_error,
|
||||
options.Description("Maximum number of allowed error during pattern matching"),
|
||||
)
|
||||
|
||||
options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel,
|
||||
options.Description("Allows for indel during pattern matching"),
|
||||
)
|
||||
|
||||
options.StringVar(&_lcaSlot, "add-lca-in", _lcaSlot,
|
||||
options.ArgName("SLOT_NAME"),
|
||||
options.Description("From the taxonomic annotation of the sequence (taxid slot or merged_taxid slot), "+
|
||||
@@ -304,14 +294,6 @@ func CLIHasPatternName() string {
|
||||
return _pattern_name
|
||||
}
|
||||
|
||||
func CLIPatternError() int {
|
||||
return _pattern_error
|
||||
}
|
||||
|
||||
func CLIPatternInDels() bool {
|
||||
return _pattern_indel
|
||||
}
|
||||
|
||||
func CLISetTaxonomicPath() bool {
|
||||
return _taxonomicPath
|
||||
}
|
||||
|
||||
@@ -19,6 +19,8 @@ var __input_fastobi_format__ = false
|
||||
var __input_ecopcr_format__ = false
|
||||
var __input_embl_format__ = false
|
||||
var __input_genbank_format__ = false
|
||||
var __input_fastq_format__ = false
|
||||
var __input_fasta_format__ = false
|
||||
|
||||
var __output_in_fasta__ = false
|
||||
var __output_in_fastq__ = false
|
||||
@@ -56,6 +58,12 @@ func InputOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&__input_genbank_format__, "genbank", __input_genbank_format__,
|
||||
options.Description("Read data following the Genbank flatfile format."))
|
||||
|
||||
options.BoolVar(&__input_fastq_format__, "fastq", __input_fastq_format__,
|
||||
options.Description("Read data following the fastq format."))
|
||||
|
||||
options.BoolVar(&__input_fasta_format__, "fasta", __input_fasta_format__,
|
||||
options.Description("Read data following the fasta format."))
|
||||
|
||||
options.BoolVar(&__no_ordered_input__, "no-order", __no_ordered_input__,
|
||||
options.Description("When several input files are provided, "+
|
||||
"indicates that there is no order among them."))
|
||||
@@ -116,6 +124,10 @@ func OptionSet(options *getoptions.GetOpt) {
|
||||
// file has to be printed.
|
||||
func CLIInputFormat() string {
|
||||
switch {
|
||||
case __input_fasta_format__:
|
||||
return "fasta"
|
||||
case __input_fastq_format__:
|
||||
return "fastq"
|
||||
case __input_ecopcr_format__:
|
||||
return "ecopcr"
|
||||
case __input_embl_format__:
|
||||
|
||||
@@ -138,6 +138,10 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
}
|
||||
|
||||
switch CLIInputFormat() {
|
||||
case "fastq":
|
||||
reader = obiformats.ReadFastqFromFile
|
||||
case "fasta":
|
||||
reader = obiformats.ReadFastaFromFile
|
||||
case "ecopcr":
|
||||
reader = obiformats.ReadEcoPCRFromFile
|
||||
case "embl":
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
@@ -43,6 +44,11 @@ var _SaveRejected = ""
|
||||
|
||||
var _PairedMode = "forward"
|
||||
|
||||
var _approx_pattern = make([]string, 0)
|
||||
var _pattern_error = 0
|
||||
var _pattern_indel = false
|
||||
var _pattern_only_forward = false
|
||||
|
||||
func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
options.StringVar(&_Taxdump, "taxdump", _Taxdump,
|
||||
@@ -143,6 +149,23 @@ func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
|
||||
options.Description("If paired reads are passed to obibrep, that option determines how the conditions "+
|
||||
"are applied to both reads."),
|
||||
)
|
||||
|
||||
options.StringSliceVar(&_approx_pattern, "approx-pattern", 1, 1,
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Regular expression pattern to be tested against the sequence itself. The pattern is case insensitive."))
|
||||
|
||||
options.IntVar(&_pattern_error, "pattern-error", _pattern_error,
|
||||
options.Description("Maximum number of allowed error during pattern matching"),
|
||||
)
|
||||
|
||||
options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel,
|
||||
options.Description("Allows for indel during pattern matching"),
|
||||
)
|
||||
|
||||
options.BoolVar(&_pattern_only_forward, "only-forward", _pattern_only_forward,
|
||||
options.Description("Look for pattern only on forward strand"),
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every options declared for
|
||||
@@ -212,6 +235,18 @@ func CLISelectedNCBITaxDump() string {
|
||||
return _Taxdump
|
||||
}
|
||||
|
||||
func CLIPatternError() int {
|
||||
return _pattern_error
|
||||
}
|
||||
|
||||
func CLIPatternInDels() bool {
|
||||
return _pattern_indel
|
||||
}
|
||||
|
||||
func CLIPatternBothStrand() bool {
|
||||
return !_pattern_only_forward
|
||||
}
|
||||
|
||||
func CLILoadSelectedTaxonomy() *obitax.Taxonomy {
|
||||
if CLISelectedNCBITaxDump() != "" {
|
||||
if _Taxonomy == nil {
|
||||
@@ -327,6 +362,30 @@ func CLISequencePatternPredicate() obiseq.SequencePredicate {
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLISequenceAgrep() obiseq.SequencePredicate {
|
||||
if len(_approx_pattern) > 0 {
|
||||
p := obiapat.IsPatternMatchSequence(
|
||||
_approx_pattern[0],
|
||||
CLIPatternError(),
|
||||
CLIPatternBothStrand(),
|
||||
CLIPatternInDels(),
|
||||
)
|
||||
|
||||
for _, pattern := range _approx_pattern[1:] {
|
||||
p = p.And(obiapat.IsPatternMatchSequence(
|
||||
pattern,
|
||||
CLIPatternError(),
|
||||
CLIPatternBothStrand(),
|
||||
CLIPatternInDels(),
|
||||
))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIDefinitionPatternPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_DefinitionPatterns) > 0 {
|
||||
@@ -419,6 +478,7 @@ func CLISequenceSelectionPredicate() obiseq.SequencePredicate {
|
||||
p = p.And(CLIIdListPredicate())
|
||||
p = p.And(CLIHasAttibutePredicate())
|
||||
p = p.And(CLIIsAttibuteMatchPredicate())
|
||||
p = p.And(CLISequenceAgrep())
|
||||
|
||||
if _InvertMatch {
|
||||
p = p.Not()
|
||||
|
||||
Reference in New Issue
Block a user