Add the possibility to look for patterns allowing indels.

Former-commit-id: 0599c2b0ad16df086dbdb08e491503870d8904be
This commit is contained in:
2023-03-20 15:28:24 +07:00
parent 5fbe52368c
commit 27d6c60e25
14 changed files with 674 additions and 219 deletions

View File

@ -151,10 +151,7 @@ int32_t ManberNoErr (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t beg
int32_t ManberSub (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberIndel (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberAll (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t NwsPatAlign (Seq *pseq , Pattern *ppat, int32_t nerr ,
int32_t *reslen , int32_t *reserr);
/* apat_sys.c */
int32_t NwsPatAlign (Seq *pseq , Pattern *ppat, int32_t nerr, int32_t begin, int32_t *reslen, int32_t *reserr); /* apat_sys.c */
float UserCpuTime (int32_t reset);
float SysCpuTime (int32_t reset);

View File

@ -23,7 +23,8 @@
#define TOPCURS CursiToTop
#define DOWNREAD ReadiDown
#define KRONECK(x, msk) ((~x & msk) ? 0 : 1)
//#define KRONECK(x, msk) ((~x & msk) ? 0 : 1)
#define KRONECK(x, msk) ((x & msk) ? 0 : 1)
#define MIN(x, y) ((x) < (y) ? (x) : (y))
/* -------------------------------------------- */
@ -192,8 +193,8 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
int e, emax, found;
uint32_t pos;
uint32_t smask, cmask, sindx;
uint32_t *pr, r[2 * MAX_PAT_ERR + 2];
patword_t smask, cmask, sindx;
patword_t *pr, r[2 * MAX_PAT_ERR + 2];
uint8_t *data;
StackiPtr *stkpos, *stkerr;
uint32_t end;
@ -220,6 +221,9 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
stkpos = pseq->hitpos + patnum;
stkerr = pseq->hiterr + patnum;
EmptyStacki(stkpos[0]);
EmptyStacki(stkerr[0]);
/* loop on text data */
for (pos = begin ; pos < end ; pos++) {
@ -256,12 +260,14 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
/* -------------------------------------------- */
int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
if (ppat->maxerr == 0)
return ManberNoErr(pseq, ppat, patnum, begin, length);
else if (ppat->hasIndel)
if (ppat->maxerr == 0){
return ManberNoErr(pseq, ppat, patnum, begin, length);}
else if (ppat->hasIndel) {
return ManberIndel(pseq, ppat, patnum, begin, length);
else
}
else {
return ManberSub(pseq, ppat, patnum, begin, length);
}
}
@ -271,11 +277,9 @@ int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
/* (avec substitution obligatoire aux bords) */
/* -------------------------------------------- */
int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
Seq *pseq;
Pattern *ppat;
int32_t nerr, *reslen, *reserr;
{
int32_t NwsPatAlign(Seq *pseq, Pattern *ppat,
int32_t nerr, int32_t begin,
int32_t *reslen, int32_t *reserr) {
uint8_t *sseq, *px;
int32_t i, j, lseq, lpat, npos, dindel, dsub,
*pc, *pi, *pd, *ps;
@ -283,7 +287,9 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
static int32_t sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)];
lseq = pseq->seqlen;
lpat = ppat->patlen;
lseq = MIN(lpat + MAX_PAT_ERR+1, pseq->seqlen - begin);
sseq = pseq->data + begin - 1;
pc = sTab; /* |----|----| --> i */
pi = pc - 1; /* | ps | pd | | */
@ -291,36 +297,39 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
ps = pd - 1; /* | pi | pc | v j */
/* |---------| */
lseq = pseq->seqlen;
lpat = ppat->patlen;
sseq = pseq->data - 1;
amask = ONEMASK >> lpat;
//amask = ONEMASK >> lpat;
amask = 0x1L << (ppat->patlen);
for (j = 0 ; j <= lpat ; j++) {
for (i = 0 , px = sseq ; i <= lseq ; i++, px++) {
if (i && j) {
dindel = MIN(*pi, *pd) + 1;
if (j == lpat) dindel--;
dsub = *ps + KRONECK(ppat->smat[*px], amask);
// fprintf(stderr, "mismatch : %d %d %d %d\n",j,*px,KRONECK(ppat->smat[*px], amask),amask);
*pc = MIN(dindel, dsub);
}
else if (i) /* j == 0 */
*pc = *pi + 1;
else if (j) /* i == 0 */
*pc = *pd + 1;
*pc = *pd;
else /* root */
*pc = 0;
fprintf(stderr," %02d",*pc);
pc++;
pi++;
pd++;
ps++;
}
fprintf(stderr,"\n");
amask <<= 1;
// amask <<= 1;
amask >>= 1;
}
pc--;
@ -331,6 +340,7 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
*reserr++ = *pc;
npos++;
}
fprintf(stderr,"i=%d *pc = %d<%d, reserr = %d npos = %d\n",i,*pc,nerr,*(reserr-1),npos);
}
return npos;

View File

@ -337,7 +337,7 @@ int32_t delete_apatseq(SeqPtr pseq,
return 1;
}
PatternPtr buildPattern(const char *pat, int32_t error_max,
PatternPtr buildPattern(const char *pat, int32_t error_max, uint8_t hasIndel,
int *errno, char **errmsg)
{
PatternPtr pattern;
@ -355,7 +355,7 @@ PatternPtr buildPattern(const char *pat, int32_t error_max,
errno,errmsg);
pattern->ok = true;
pattern->hasIndel= false;
pattern->hasIndel= hasIndel;
pattern->maxerr = error_max;
pattern->cpat = (char*)pattern + sizeof(Pattern);

View File

@ -118,7 +118,7 @@ ecoseq_t *new_ecoseq_with_data( char *AC,
int32_t delete_apatseq(SeqPtr pseq,
int *errno, char **errmsg);
PatternPtr buildPattern(const char *pat, int32_t error_max, int *errno, char **errmsg);
PatternPtr buildPattern(const char *pat, int32_t error_max, uint8_t hasIndel, int *errno, char **errmsg);
PatternPtr complementPattern(PatternPtr pat, int *errno, char **errmsg);
SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,

View File

@ -13,6 +13,8 @@ import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
@ -65,14 +67,20 @@ var NilApatSequence = ApatSequence{nil}
// the errormax parameter. Some positions can be marked as not
// allowed for mismatches. They have to be signaled using a '#'
// sign after the corresponding nucleotide.
func MakeApatPattern(pattern string, errormax int) (ApatPattern, error) {
func MakeApatPattern(pattern string, errormax int, allowsIndel bool) (ApatPattern, error) {
cpattern := C.CString(pattern)
defer C.free(unsafe.Pointer(cpattern))
cerrormax := C.int32_t(errormax)
callosindel := C.uint8_t(0)
if allowsIndel {
callosindel = C.uint8_t(1)
}
var errno C.int32_t
var errmsg *C.char
apc := C.buildPattern(cpattern, cerrormax, &errno, &errmsg)
apc := C.buildPattern(cpattern, cerrormax, callosindel, &errno, &errmsg)
if apc == nil {
message := C.GoString(errmsg)
@ -281,16 +289,13 @@ func (sequence ApatSequence) Free() {
// values of the [3]int indicate respectively the start and the end position of
// the match. Following the Go convention the end position is not included in the
// match. The third value indicates the number of errors detected for this occurrence.
func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (loc [][3]int) {
begin := 0
length := sequence.Len()
if len(limits) > 0 {
begin = limits[0]
func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int) (loc [][3]int) {
if begin < 0 {
begin = 0
}
if len(limits) > 1 {
length = limits[1]
if length < 0 {
length = sequence.Len()
}
nhits := int(C.ManberAll(sequence.pointer.pointer,
@ -310,13 +315,70 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (l
for i := 0; i < nhits; i++ {
start := int(stktmp[i])
err := int(errtmp[i])
log.Debugln(C.GoString(pattern.pointer.pointer.cpat), start, err)
loc = append(loc, [3]int{start, start + patlen, err})
}
log.Debugln("------------")
return loc
}
// BestMatch returns the occurrence of pattern with the fewest errors among
// those reported by FindAllIndex for the region described by begin and length
// (both are forwarded unchanged to FindAllIndex).
//
// matched is false, and start, end and nerr are zero, when FindAllIndex
// reports no occurrence. Otherwise the first occurrence carrying the lowest
// error count is selected.
//
// When that occurrence has no error, or when the pattern does not allow
// indels, start, end and nerr are taken directly from the selected
// occurrence. Otherwise the pattern is re-aligned with
// obialign.FastLCSEGFScoreByte against the window
// [hit start - nerr, hit start + pattern length + nerr), clipped to the
// sequence bounds; nerr becomes the alignment length minus its score and
// start is recomputed from the alignment length.
//
// NOTE(review): on the re-alignment path end is the clipped window end, not a
// value derived from the alignment — confirm this is what callers expect.
func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (start int, end int, nerr int, matched bool) {
	res := pattern.FindAllIndex(sequence, begin, length)
	if len(res) == 0 {
		return 0, 0, 0, false
	}

	// Keep the first hit having the smallest error count.
	best := res[0]
	for _, m := range res[1:] {
		if m[2] < best[2] {
			best = m
		}
	}
	log.Debugln(best)

	nerr = best[2]

	// Without errors, or without indels, the hit coordinates are already exact.
	if nerr == 0 || !pattern.pointer.pointer.hasIndel {
		log.Debugln("No nws ", best[0], nerr)
		return best[0], best[1], nerr, true
	}

	patlen := int(pattern.pointer.pointer.patlen)

	// Widen the hit by nerr on both sides so the alignment can absorb indels,
	// then clip the window to the sequence.
	start = goutils.MaxInt(best[0]-nerr, 0)
	end = goutils.MinInt(best[0]+patlen+nerr, sequence.Len())

	// View the C character buffers as Go byte arrays without copying them.
	cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
	cseq := (*[1 << 30]byte)(unsafe.Pointer(sequence.pointer.pointer.cseq))

	log.Debugln(
		string((*cseq)[start:end]),
		string((*cpattern)[0:patlen]),
		best[0], nerr, patlen,
		sequence.Len(), start, end)

	// The scratch matrix is only needed on this path, so it is created here
	// rather than on every call.
	sbuffer := [(int(C.MAX_PAT_LEN) + int(C.MAX_PAT_ERR) + 1) * (int(C.MAX_PAT_LEN) + 1)]uint64{}
	buffer := sbuffer[:]

	score, lali := obialign.FastLCSEGFScoreByte(
		(*cseq)[start:end],
		(*cpattern)[0:patlen],
		nerr*2, true, &buffer)

	nerr = lali - score
	start = best[0] + patlen - lali
	log.Debugln("results", score, lali, start, nerr)

	return start, end, nerr, true
}
// tagaacaggctcctctag
// func AllocatedApaSequences() int {
// return int(_AllocatedApaSequences)
// }

View File

@ -133,7 +133,7 @@ func OptionForwardPrimer(primer string, max int) WithOption {
f := WithOption(func(opt Options) {
var err error
opt.pointer.forward, err = MakeApatPattern(primer, max)
opt.pointer.forward, err = MakeApatPattern(primer, max, false)
if err != nil {
log.Fatalf("error : %v\n", err)
}
@ -155,7 +155,7 @@ func OptionReversePrimer(primer string, max int) WithOption {
f := WithOption(func(opt Options) {
var err error
opt.pointer.reverse, err = MakeApatPattern(primer, max)
opt.pointer.reverse, err = MakeApatPattern(primer, max, false)
if err != nil {
log.Fatalf("error : %v\n", err)
}
@ -210,7 +210,7 @@ func _Pcr(seq ApatSequence,
reverse := opt.pointer.reverse
crev := opt.pointer.crev
forwardMatches := forward.FindAllIndex(seq)
forwardMatches := forward.FindAllIndex(seq,0,-1)
if len(forwardMatches) > 0 {
@ -284,7 +284,7 @@ func _Pcr(seq ApatSequence,
}
}
forwardMatches = reverse.FindAllIndex(seq)
forwardMatches = reverse.FindAllIndex(seq,0,-1)
if forwardMatches != nil {
begin := forwardMatches[0][0]