Add option related to agrep match on obigrep and obiannotate

This commit is contained in:
Eric Coissac
2024-09-09 16:52:13 +02:00
parent 65ae82622e
commit 05bf2bfd6c
12 changed files with 167 additions and 34 deletions

View File

@ -2,6 +2,12 @@
## Latest changes ## Latest changes
### New features
- Most of the time obitools identify automatically sequence file format. But
it fails sometimes. Two new option **--fasta** and **--fastq** are added to
allow the processing of the rare fasta and fastq files not recognized.
## August 2nd, 2024. Release 4.3.0 ## August 2nd, 2024. Release 4.3.0
### Change of git repositiory ### Change of git repositiory

View File

@ -25,17 +25,18 @@ func buffIndex(i, j, width int) int {
// //
// The function returns the start and end positions of the best // The function returns the start and end positions of the best
// match, as well as the number of errors in the best match. // match, as well as the number of errors in the best match.
func LocatePattern(pattern, sequence []byte) (int, int, int) { func LocatePattern(id string, pattern, sequence []byte) (int, int, int) {
if len(pattern) >= len(sequence) {
log.Panicf("Sequence %s:Pattern %s must be shorter than sequence %s", id, pattern, sequence)
}
// Pattern spreads over the columns // Pattern spreads over the columns
// Sequence spreads over the rows // Sequence spreads over the rows
width := len(pattern) + 1 width := len(pattern) + 1
buffsize := (len(pattern) + 1) * (len(sequence) + 1) buffsize := (len(pattern) + 1) * (len(sequence) + 1)
buffer := make([]int, buffsize) buffer := make([]int, buffsize)
if len(pattern) >= len(sequence) {
log.Panicf("Pattern %s must be shorter than sequence %s", pattern, sequence)
}
// The path matrix keeps track of the best path through the matrix // The path matrix keeps track of the best path through the matrix
// 0 : indicate the diagonal path // 0 : indicate the diagonal path
// 1 : indicate the up path // 1 : indicate the up path

View File

@ -297,6 +297,24 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int
return loc return loc
} }
func (pattern ApatPattern) IsMatching(sequence ApatSequence, begin, length int) bool {
if begin < 0 {
begin = 0
}
if length < 0 {
length = sequence.Len()
}
nhits := int(C.ManberAll(sequence.pointer.pointer,
pattern.pointer.pointer,
0,
C.int32_t(begin),
C.int32_t(length+C.MAX_PAT_LEN)))
return nhits > 0
}
// BestMatch finds the best match of a given pattern in a sequence. // BestMatch finds the best match of a given pattern in a sequence.
// //
// THe function identify the first occurrence of the pattern in the sequence. // THe function identify the first occurrence of the pattern in the sequence.
@ -336,6 +354,11 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (
nerr = best[2] nerr = best[2]
end = best[1] end = best[1]
if best[0] < 0 || best[1] > sequence.Len() {
matched = false
return
}
if nerr == 0 || !pattern.pointer.pointer.hasIndel { if nerr == 0 || !pattern.pointer.pointer.hasIndel {
start = best[0] start = best[0]
log.Debugln("No nws ", start, nerr) log.Debugln("No nws ", start, nerr)
@ -356,14 +379,16 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (
best[0], nerr, int(pattern.pointer.pointer.patlen), best[0], nerr, int(pattern.pointer.pointer.patlen),
sequence.Len(), start, end) sequence.Len(), start, end)
from, to, score := obialign.LocatePattern((*cpattern)[0:int(pattern.pointer.pointer.patlen)], frg) from, to, score := obialign.LocatePattern(sequence.pointer.reference.Id(),
(*cpattern)[0:int(pattern.pointer.pointer.patlen)],
frg)
// olderr := m[2] // olderr := m[2]
nerr = score nerr = score
start = start + from start = start + from
end = start + to end = start + to
log.Debugln("results", score, start, nerr) log.Debugf("BestMatch on %s : score=%d [%d..%d]", sequence.pointer.reference.Id(), score, start, nerr)
return return
} }
@ -454,7 +479,10 @@ func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int)
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
frg := sequence.pointer.reference.Sequence()[start:end] frg := sequence.pointer.reference.Sequence()[start:end]
pb, pe, score := obialign.LocatePattern((*cpattern)[0:int(pattern.pointer.pointer.patlen)], frg) pb, pe, score := obialign.LocatePattern(
sequence.pointer.reference.Id(),
(*cpattern)[0:int(pattern.pointer.pointer.patlen)],
frg)
// olderr := m[2] // olderr := m[2]
m[2] = score m[2] = score

40
pkg/obiapat/predicat.go Normal file
View File

@ -0,0 +1,40 @@
package obiapat
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
log "github.com/sirupsen/logrus"
)
func IsPatternMatchSequence(pattern string, errormax int, bothStrand, allowIndels bool) obiseq.SequencePredicate {
pat, err := MakeApatPattern(pattern, errormax, allowIndels)
if err != nil {
log.Fatalf("error in sequence regular pattern syntax : %v", err)
}
cpat, err := pat.ReverseComplement()
if err != nil {
log.Fatalf("cannot reverse complement the pattern : %v", err)
}
f := func(sequence *obiseq.BioSequence) bool {
aseq, err := MakeApatSequence(sequence, false)
if err != nil {
log.Panicf("Cannot convert sequence %s to apat format", sequence.Id())
}
match := pat.IsMatching(aseq, 0, aseq.Len())
if !match && bothStrand {
match = cpat.IsMatching(aseq, 0, aseq.Len())
}
return match
}
return f
}

View File

@ -107,7 +107,7 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv") mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv")
// Create a buffer to store the read data // Create a buffer to store the read data
buf := make([]byte, 1024*128) buf := make([]byte, 1024*1024)
n, err := io.ReadFull(stream, buf) n, err := io.ReadFull(stream, buf)
if err != nil && err != io.ErrUnexpectedEOF { if err != nil && err != io.ErrUnexpectedEOF {

View File

@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This induces that the version // TODO: The version number is extracted from git. This induces that the version
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// commited // commited
var _Commit = "373464c" var _Commit = "65ae826"
var _Version = "Release 4.2.0" var _Version = "Release 4.2.0"
// Version returns the version of the obitools package. // Version returns the version of the obitools package.

View File

@ -209,7 +209,6 @@ func IsSequenceMatch(pattern string) SequencePredicate {
return f return f
} }
func IsDefinitionMatch(pattern string) SequencePredicate { func IsDefinitionMatch(pattern string) SequencePredicate {
pat, err := regexp.Compile(pattern) pat, err := regexp.Compile(pattern)

View File

@ -25,7 +25,7 @@ func DeleteAttributesWorker(toBeDeleted []string) obiseq.SeqWorker {
return f return f
} }
func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) obiseq.SeqWorker { func MatchPatternWorker(pattern, name string, errormax int, bothStrand, allowsIndel bool) obiseq.SeqWorker {
pat, err := obiapat.MakeApatPattern(pattern, errormax, allowsIndel) pat, err := obiapat.MakeApatPattern(pattern, errormax, allowsIndel)
if err != nil { if err != nil {
log.Fatalf("error in compiling pattern (%s) : %v", pattern, err) log.Fatalf("error in compiling pattern (%s) : %v", pattern, err)
@ -56,7 +56,7 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob
start, end, nerr, matched := pat.BestMatch(apats, 0, s.Len()) start, end, nerr, matched := pat.BestMatch(apats, 0, s.Len())
if matched { if matched && start >= 0 && end <= s.Len() {
annot := s.Annotations() annot := s.Annotations()
annot[slot] = pattern annot[slot] = pattern
@ -75,7 +75,7 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob
} else { } else {
start, end, nerr, matched := cpat.BestMatch(apats, 0, s.Len()) start, end, nerr, matched := cpat.BestMatch(apats, 0, s.Len())
if matched { if matched && start >= 0 && end <= s.Len() {
annot := s.Annotations() annot := s.Annotations()
annot[slot] = pattern annot[slot] = pattern
match, err := s.Subsequence(start, end, false) match, err := s.Subsequence(start, end, false)
@ -328,9 +328,10 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
} }
if CLIHasPattern() { if CLIHasPattern() {
log.Infof("Match pattern %s with %d error", CLIPattern(), CLIPatternError()) log.Infof("Match pattern %s with %d error", CLIPattern(), obigrep.CLIPatternError())
w := MatchPatternWorker(CLIPattern(), CLIHasPatternName(), w := MatchPatternWorker(CLIPattern(), CLIHasPatternName(),
CLIPatternError(), CLIPatternInDels()) obigrep.CLIPatternError(), obigrep.CLIPatternBothStrand(),
obigrep.CLIPatternInDels())
annotator = annotator.ChainWorkers(w) annotator = annotator.ChainWorkers(w)
} }

View File

@ -24,8 +24,6 @@ var _setSeqLength = false
var _uniqueID = false var _uniqueID = false
var _ahoCorazick = "" var _ahoCorazick = ""
var _pattern = "" var _pattern = ""
var _pattern_error = 0
var _pattern_indel = false
var _pattern_name = "pattern" var _pattern_name = "pattern"
var _lcaSlot = "" var _lcaSlot = ""
var _lcaError = 0.0 var _lcaError = 0.0
@ -62,14 +60,6 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
options.Description("specify the name to use as prefix for the slots reporting the match"), options.Description("specify the name to use as prefix for the slots reporting the match"),
) )
options.IntVar(&_pattern_error, "pattern-error", _pattern_error,
options.Description("Maximum number of allowed error during pattern matching"),
)
options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel,
options.Description("Allows for indel during pattern matching"),
)
options.StringVar(&_lcaSlot, "add-lca-in", _lcaSlot, options.StringVar(&_lcaSlot, "add-lca-in", _lcaSlot,
options.ArgName("SLOT_NAME"), options.ArgName("SLOT_NAME"),
options.Description("From the taxonomic annotation of the sequence (taxid slot or merged_taxid slot), "+ options.Description("From the taxonomic annotation of the sequence (taxid slot or merged_taxid slot), "+
@ -304,14 +294,6 @@ func CLIHasPatternName() string {
return _pattern_name return _pattern_name
} }
func CLIPatternError() int {
return _pattern_error
}
func CLIPatternInDels() bool {
return _pattern_indel
}
func CLISetTaxonomicPath() bool { func CLISetTaxonomicPath() bool {
return _taxonomicPath return _taxonomicPath
} }

View File

@ -19,6 +19,8 @@ var __input_fastobi_format__ = false
var __input_ecopcr_format__ = false var __input_ecopcr_format__ = false
var __input_embl_format__ = false var __input_embl_format__ = false
var __input_genbank_format__ = false var __input_genbank_format__ = false
var __input_fastq_format__ = false
var __input_fasta_format__ = false
var __output_in_fasta__ = false var __output_in_fasta__ = false
var __output_in_fastq__ = false var __output_in_fastq__ = false
@ -56,6 +58,12 @@ func InputOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__input_genbank_format__, "genbank", __input_genbank_format__, options.BoolVar(&__input_genbank_format__, "genbank", __input_genbank_format__,
options.Description("Read data following the Genbank flatfile format.")) options.Description("Read data following the Genbank flatfile format."))
options.BoolVar(&__input_fastq_format__, "fastq", __input_fastq_format__,
options.Description("Read data following the fastq format."))
options.BoolVar(&__input_fasta_format__, "fasta", __input_fasta_format__,
options.Description("Read data following the fasta format."))
options.BoolVar(&__no_ordered_input__, "no-order", __no_ordered_input__, options.BoolVar(&__no_ordered_input__, "no-order", __no_ordered_input__,
options.Description("When several input files are provided, "+ options.Description("When several input files are provided, "+
"indicates that there is no order among them.")) "indicates that there is no order among them."))
@ -116,6 +124,10 @@ func OptionSet(options *getoptions.GetOpt) {
// file has to be printed. // file has to be printed.
func CLIInputFormat() string { func CLIInputFormat() string {
switch { switch {
case __input_fasta_format__:
return "fasta"
case __input_fastq_format__:
return "fastq"
case __input_ecopcr_format__: case __input_ecopcr_format__:
return "ecopcr" return "ecopcr"
case __input_embl_format__: case __input_embl_format__:

View File

@ -138,6 +138,10 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
} }
switch CLIInputFormat() { switch CLIInputFormat() {
case "fastq":
reader = obiformats.ReadFastqFromFile
case "fasta":
reader = obiformats.ReadFastaFromFile
case "ecopcr": case "ecopcr":
reader = obiformats.ReadEcoPCRFromFile reader = obiformats.ReadEcoPCRFromFile
case "embl": case "embl":

View File

@ -6,6 +6,7 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
@ -43,6 +44,11 @@ var _SaveRejected = ""
var _PairedMode = "forward" var _PairedMode = "forward"
var _approx_pattern = make([]string, 0)
var _pattern_error = 0
var _pattern_indel = false
var _pattern_only_forward = false
func TaxonomySelectionOptionSet(options *getoptions.GetOpt) { func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_Taxdump, "taxdump", _Taxdump, options.StringVar(&_Taxdump, "taxdump", _Taxdump,
@ -143,6 +149,23 @@ func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
options.Description("If paired reads are passed to obibrep, that option determines how the conditions "+ options.Description("If paired reads are passed to obibrep, that option determines how the conditions "+
"are applied to both reads."), "are applied to both reads."),
) )
options.StringSliceVar(&_approx_pattern, "approx-pattern", 1, 1,
options.ArgName("PATTERN"),
options.Description("Regular expression pattern to be tested against the sequence itself. The pattern is case insensitive."))
options.IntVar(&_pattern_error, "pattern-error", _pattern_error,
options.Description("Maximum number of allowed error during pattern matching"),
)
options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel,
options.Description("Allows for indel during pattern matching"),
)
options.BoolVar(&_pattern_only_forward, "only-forward", _pattern_only_forward,
options.Description("Look for pattern only on forward strand"),
)
} }
// OptionSet adds to the basic option set every options declared for // OptionSet adds to the basic option set every options declared for
@ -212,6 +235,18 @@ func CLISelectedNCBITaxDump() string {
return _Taxdump return _Taxdump
} }
func CLIPatternError() int {
return _pattern_error
}
func CLIPatternInDels() bool {
return _pattern_indel
}
func CLIPatternBothStrand() bool {
return !_pattern_only_forward
}
func CLILoadSelectedTaxonomy() *obitax.Taxonomy { func CLILoadSelectedTaxonomy() *obitax.Taxonomy {
if CLISelectedNCBITaxDump() != "" { if CLISelectedNCBITaxDump() != "" {
if _Taxonomy == nil { if _Taxonomy == nil {
@ -327,6 +362,30 @@ func CLISequencePatternPredicate() obiseq.SequencePredicate {
return nil return nil
} }
func CLISequenceAgrep() obiseq.SequencePredicate {
if len(_approx_pattern) > 0 {
p := obiapat.IsPatternMatchSequence(
_approx_pattern[0],
CLIPatternError(),
CLIPatternBothStrand(),
CLIPatternInDels(),
)
for _, pattern := range _approx_pattern[1:] {
p = p.And(obiapat.IsPatternMatchSequence(
pattern,
CLIPatternError(),
CLIPatternBothStrand(),
CLIPatternInDels(),
))
}
return p
}
return nil
}
func CLIDefinitionPatternPredicate() obiseq.SequencePredicate { func CLIDefinitionPatternPredicate() obiseq.SequencePredicate {
if len(_DefinitionPatterns) > 0 { if len(_DefinitionPatterns) > 0 {
@ -419,6 +478,7 @@ func CLISequenceSelectionPredicate() obiseq.SequencePredicate {
p = p.And(CLIIdListPredicate()) p = p.And(CLIIdListPredicate())
p = p.And(CLIHasAttibutePredicate()) p = p.And(CLIHasAttibutePredicate())
p = p.And(CLIIsAttibuteMatchPredicate()) p = p.And(CLIIsAttibuteMatchPredicate())
p = p.And(CLISequenceAgrep())
if _InvertMatch { if _InvertMatch {
p = p.Not() p = p.Not()