Files
obitools4/pkg/obiapat/pattern.go
Eric Coissac 3f69fa41d6 Patch a bug for multiple amplicon per sequence.
Former-commit-id: b252d2de8e1a85d65c2951aa1958ee038e35741d
2023-03-31 15:10:25 +02:00

360 lines
10 KiB
Go

package obiapat
/*
#cgo CFLAGS: -g -Wall
#include <stdlib.h>
#include "obiapat.h"
*/
import "C"
import (
"errors"
"runtime"
"unsafe"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
var _MaxPatLen = int(C.MAX_PAT_LEN)
// var _AllocatedApaSequences = int64(0)
var _AllocatedApaPattern = 0
// ApatPattern stores a regular pattern usable by the
// Apat algorithm functions and methods
type _ApatPattern struct {
pointer *C.Pattern
pattern string
}
type ApatPattern struct {
pointer *_ApatPattern
}
// ApatSequence stores sequence in structure usable by the
// Apat algorithm functions and methods
type _ApatSequence struct {
pointer *C.Seq
reference *obiseq.BioSequence
}
type ApatSequence struct {
pointer *_ApatSequence
}
// NilApatPattern is the nil instance of the BuildAlignArena
// type.
var NilApatPattern = ApatPattern{nil}
// NilApatSequence is the nil instance of the ApatSequence
// type.
var NilApatSequence = ApatSequence{nil}
// MakeApatPattern builds a new ApatPattern.
// The created object wrap a C allocated structure.
// Do not forget to free it when it is no more needed
// to forbid memory leaks using the Free methode of the
// ApatPattern.
// The pattern is a short DNA sequence (up to 64 symboles).
// Ambiguities can be represented or using UIPAC symboles,
// or using the [...] classical in regular pattern grammar.
// For example, the ambiguity A/T can be indicated using W
// or [AT]. A nucleotide can be negated by preceding it with
// a '!'. The APAT algorithm allows for error during the
// matching process. The maximum number of tolerated error
// is indicated at the construction of the pattern using
// the errormax parameter. Some positions can be marked as not
// allowed for mismatches. They have to be signaled using a '#'
// sign after the corresponding nucleotide.
func MakeApatPattern(pattern string, errormax int, allowsIndel bool) (ApatPattern, error) {
cpattern := C.CString(pattern)
defer C.free(unsafe.Pointer(cpattern))
cerrormax := C.int32_t(errormax)
callosindel := C.uint8_t(0)
if allowsIndel {
callosindel = C.uint8_t(1)
}
var errno C.int32_t
var errmsg *C.char
apc := C.buildPattern(cpattern, cerrormax, callosindel, &errno, &errmsg)
if apc == nil {
message := C.GoString(errmsg)
C.free(unsafe.Pointer(errmsg))
return NilApatPattern, errors.New(message)
}
ap := _ApatPattern{apc,pattern}
runtime.SetFinalizer(&ap, func(p *_ApatPattern) {
// log.Printf("Finaliser called on %s\n", C.GoString(p.pointer.cpat))
C.free(unsafe.Pointer(p.pointer))
})
return ApatPattern{pointer: &ap}, nil
}
// ReverseComplement method builds a new ApatPattern
// matching the reverse complemented sequence of the original
// pattern.
func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) {
var errno C.int32_t
var errmsg *C.char
apc := C.complementPattern((*C.Pattern)(pattern.pointer.pointer), &errno, &errmsg)
if apc == nil {
message := C.GoString(errmsg)
C.free(unsafe.Pointer(errmsg))
return ApatPattern{nil}, errors.New(message)
}
spat := C.GoString(apc.cpat)
ap := _ApatPattern{apc,spat}
runtime.SetFinalizer(&ap, func(p *_ApatPattern) {
// log.Printf("Finaliser called on %s\n", C.GoString(p.pointer.cpat))
C.free(unsafe.Pointer(p.pointer))
})
return ApatPattern{pointer: &ap}, nil
}
// String method casts the ApatPattern to a Go String.
func (pattern ApatPattern) String() string {
return pattern.pointer.pattern
//return C.GoString(pattern.pointer.pointer.cpat)
}
// Len method returns the length of the matched pattern.
func (pattern ApatPattern) Len() int {
return int(pattern.pointer.pointer.patlen)
}
// Release the C allocated memory of an ApatPattern instance.
//
// Thee method ensurse that the C structure wrapped in
// an ApatPattern instance is released. Normally this
// action is taken in charge by a finalizer and the call
// to the Free meethod is not mandatory. Nevertheless,
// If you choose to call this method, it will disconnect
// the finalizer associated to the ApatPattern instance
// to avoid double freeing.
func (pattern ApatPattern) Free() {
// log.Printf("Free called on %s\n", C.GoString(pattern.pointer.pointer.cpat))
if pattern.pointer != nil {
C.free(unsafe.Pointer(pattern.pointer.pointer))
runtime.SetFinalizer(pattern.pointer, nil)
pattern.pointer = nil
}
}
// Print method prints the ApatPattern to the standard output.
// This is mainly a debug method.
func (pattern ApatPattern) Print() {
C.PrintDebugPattern(C.PatternPtr(pattern.pointer.pointer))
}
// MakeApatSequence casts an obiseq.BioSequence to an ApatSequence.
// The circular parameter indicates the topology of the sequence.
// if sequence is circular (ciruclar = true), the match can occurs
// at the junction. To limit memory allocation, it is possible to provide
// an already allocated ApatSequence to recycle its allocated memory.
// The provided sequence is no more usable after the call.
func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...ApatSequence) (ApatSequence, error) {
var errno C.int32_t
var errmsg *C.char
seqlen := sequence.Len()
ic := 0
if circular {
ic = 1
}
var out *C.Seq
if len(recycle) > 0 {
out = recycle[0].pointer.pointer
} else {
out = nil
}
// copy the data into the buffer, by converting it to a Go array
p := unsafe.Pointer(unsafe.SliceData(sequence.Sequence()))
pseqc := C.new_apatseq((*C.char)(p), C.int32_t(ic), C.int32_t(seqlen),
(*C.Seq)(out),
&errno, &errmsg)
if pseqc == nil {
message := C.GoString(errmsg)
C.free(unsafe.Pointer(errmsg))
if p != nil {
C.free(p)
// atomic.AddInt64(&_AllocatedApaSequences, -1)
}
return NilApatSequence, errors.New(message)
}
if out == nil {
// log.Printf("Make ApatSeq called on %p -> %p\n", out, pseqc)
seq := _ApatSequence{pointer: pseqc,reference: sequence}
runtime.SetFinalizer(&seq, func(apat_p *_ApatSequence) {
var errno C.int32_t
var errmsg *C.char
log.Debugf("Finaliser called on %p\n", apat_p.pointer)
if apat_p != nil && apat_p.pointer != nil {
C.delete_apatseq(apat_p.pointer, &errno, &errmsg)
}
})
return ApatSequence{&seq}, nil
}
recycle[0].pointer.pointer = pseqc
recycle[0].pointer.reference = sequence
//log.Println(C.GoString(pseq.cseq))
return ApatSequence{recycle[0].pointer}, nil
}
// Len method returns the length of the ApatSequence.
func (sequence ApatSequence) Len() int {
return int(sequence.pointer.pointer.seqlen)
}
// Free method ensure that the C structure wrapped is
// desallocated
func (sequence ApatSequence) Free() {
var errno C.int32_t
var errmsg *C.char
log.Debugf("Free called on %p\n", sequence.pointer.pointer)
if sequence.pointer != nil && sequence.pointer.pointer != nil {
C.delete_apatseq(sequence.pointer.pointer,
&errno, &errmsg)
runtime.SetFinalizer(sequence.pointer, nil)
sequence.pointer = nil
}
}
// FindAllIndex methood returns the position of every occurrences of the
// pattern on the provided sequences. The search can be limited
// to a portion of the sequence by adding one or two integer parameters
// when calling the FindAllIndex method. The fisrt optional argument indicates
// the starting point of the search. The first nucleotide of the sequence is
// indexed as 0. The second optional argument indicates the length of the region
// where the pattern is looked for.
// The FindAllIndex methood returns return a slice of [3]int. The two firsts
// values of the [3]int indicate respectively the start and the end position of
// the match. Following the GO convention the end position is not included in the
// match. The third value indicates the number of error detected for this occurrence.
func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int) (loc [][3]int) {
if begin < 0 {
begin = 0
}
if length < 0 {
length = sequence.Len()
}
nhits := int(C.ManberAll(sequence.pointer.pointer,
pattern.pointer.pointer,
0,
C.int32_t(begin),
C.int32_t(length+C.MAX_PAT_LEN)))
if nhits == 0 {
return nil
}
stktmp := (*[1 << 30]int32)(unsafe.Pointer(sequence.pointer.pointer.hitpos[0].val))
errtmp := (*[1 << 30]int32)(unsafe.Pointer(sequence.pointer.pointer.hiterr[0].val))
patlen := int(pattern.pointer.pointer.patlen)
for i := 0; i < nhits; i++ {
start := int(stktmp[i])
err := int(errtmp[i])
//log.Debugln(C.GoString(pattern.pointer.pointer.cpat), start, err)
loc = append(loc, [3]int{start, start + patlen, err})
}
//log.Debugln("------------")
return loc
}
func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (start int, end int, nerr int, matched bool) {
res := pattern.FindAllIndex(sequence, begin, length)
sbuffer := [(int(C.MAX_PAT_LEN) + int(C.MAX_PAT_ERR) + 1) * (int(C.MAX_PAT_LEN) + 1)]uint64{}
buffer := sbuffer[:]
if len(res) == 0 {
matched = false
return
}
matched = true
best := [3]int{0, 0, 10000}
for _, m := range res {
if m[2] < best[2] {
best = m
log.Debugln(best)
}
}
nerr = best[2]
end = best[1]
if nerr == 0 || !pattern.pointer.pointer.hasIndel {
start = best[0]
log.Debugln("No nws ", start, nerr)
return
}
start = best[0] - nerr
end = best[0] + int(pattern.pointer.pointer.patlen) + nerr
start = obiutils.MaxInt(start, 0)
end = obiutils.MinInt(end, sequence.Len())
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
frg := sequence.pointer.reference.Sequence()[start:end]
log.Debugln(
string(frg),
string((*cpattern)[0:int(pattern.pointer.pointer.patlen)]),
best[0], nerr, int(pattern.pointer.pointer.patlen),
sequence.Len(), start, end)
score, lali := obialign.FastLCSEGFScoreByte(
frg,
(*cpattern)[0:int(pattern.pointer.pointer.patlen)],
nerr, true, &buffer)
nerr = lali - score
start = best[0] + int(pattern.pointer.pointer.patlen) - lali
end = start + lali
log.Debugln("results", score, lali, start, nerr)
return
}
// tagaacaggctcctctag
// func AllocatedApaSequences() int {
// return int(_AllocatedApaSequences)
// }