Files
obitools4/pkg/obiapat/apat_search.c
Eric Coissac 27d6c60e25 Add the possibility to look for patterns allowing indels.
Former-commit-id: 0599c2b0ad16df086dbdb08e491503870d8904be
2023-03-20 15:28:24 +07:00

348 lines
11 KiB
C

/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Dec. 94 */
/* File: apat_search.c */
/* Purpose: recherche du pattern */
/* algorithme de Baeza-Yates/Gonnet */
/* Manber (agrep) */
/* History: */
/* 07/12/94 : <MFS> first version */
/* 28/12/94 : <Gloup> revised version */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> last some cleaning for 2020 */
/* ==================================================== */
#include <stdlib.h>
#include <string.h>
#include "libstki.h"
#include "apat.h"
#define POP PopiOut
#define PUSH PushiIn
#define TOPCURS CursiToTop
#define DOWNREAD ReadiDown
//#define KRONECK(x, msk) ((~x & msk) ? 0 : 1)
#define KRONECK(x, msk) ((x & msk) ? 0 : 1)
#define MIN(x, y) ((x) < (y) ? (x) : (y))
/* -------------------------------------------- */
/* Construction de la matrice S */
/* -------------------------------------------- */
int CreateS(Pattern *ppat, int32_t lalpha)
{
int32_t indx, pindx, i, j;
patword_t amask, omask, *smat;
ppat->ok = false;
omask = 0x0L;
// if (! (smat = NEWN(uint32_t, lalpha)))
// return 0;
smat = ppat->smat;
for (i = 0 ; i < lalpha ; i++)
smat[i] = 0x0;
for (i = ppat->patlen - 1, amask = 0x1L ; i >= 0 ; i--, amask <<= 1) {
indx = ppat->patcode[i];
if (ppat->patcode[i] & OBLIBIT)
omask |= amask;
for (j = 0, pindx = 0x1L ; j < lalpha ; j++, pindx <<= 1)
if (indx & pindx)
smat[j] |= amask;
}
ppat->smat = smat;
ppat->omask = omask;
ppat->ok = true;
return 1;
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm */
/* NoError */
/* -------------------------------------------- */
int32_t ManberNoErr(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
int32_t pos;
patword_t smask, r;
uint8_t *data;
StackiPtr *stkpos, *stkerr;
int32_t end;
end = begin + length;
end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);
/* create local masks */
smask = r = 0x1L << ppat->patlen;
/* init. scan */
data = pseq->data + begin;
stkpos = pseq->hitpos + patnum;
EmptyStacki(stkpos[0]);
stkerr = pseq->hiterr + patnum;
EmptyStacki(stkerr[0]);
/* loop on text data */
for (pos = begin ; pos < end ; pos++) {
r = (r >> 1) & ppat->smat[*data++];
if (r & 0x1L) {
PUSH(stkpos, pos - ppat->patlen + 1);
PUSH(stkerr, 0);
}
r |= smask;
}
return (*stkpos)->top; /* aka # of hits */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm */
/* Substitution only */
/* */
/* Note : r array is stored as : */
/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... */
/* */
/* -------------------------------------------- */
int32_t ManberSub(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
int e, emax, found;
uint32_t pos;
patword_t smask, cmask, sindx;
patword_t *pr, r[2 * MAX_PAT_ERR + 2];
uint8_t *data;
StackiPtr *stkpos, *stkerr;
uint32_t end;
end = begin + length;
end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);
/* create local masks */
emax = ppat->maxerr;
r[0] = r[1] = 0x0;
cmask = smask = 0x1L << ppat->patlen;
for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2)
*pr = cmask;
cmask = ~ ppat->omask;
/* init. scan */
data = pseq->data + begin;
stkpos = pseq->hitpos + patnum;
EmptyStacki(stkpos[0]);
stkerr = pseq->hiterr + patnum;
EmptyStacki(stkerr[0]);
/* loop on text data */
for (pos = begin ; pos < end ; pos++) {
sindx = ppat->smat[*data++];
for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) {
pr[2] = pr[3] | smask;
pr[3] = ((pr[0] >> 1) & cmask) /* sub */
| ((pr[2] >> 1) & sindx); /* ident */
if (pr[3] & 0x1L) { /* found */
if (! found) {
PUSH(stkpos, pos - ppat->patlen + 1);
PUSH(stkerr, e);
}
found++;
}
}
}
return (*stkpos)->top; /* aka # of hits */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm */
/* Substitution + Indels */
/* */
/* Note : r array is stored as : */
/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... */
/* */
/* Warning: may return shifted pos. */
/* */
/* -------------------------------------------- */
int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
int e, emax, found;
uint32_t pos;
patword_t smask, cmask, sindx;
patword_t *pr, r[2 * MAX_PAT_ERR + 2];
uint8_t *data;
StackiPtr *stkpos, *stkerr;
uint32_t end;
end = begin + length;
end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);
/* create local masks */
emax = ppat->maxerr;
r[0] = r[1] = 0x0;
cmask = smask = 0x1L << ppat->patlen;
for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2) {
*pr = cmask;
cmask = (cmask >> 1) | smask;
}
cmask = ~ ppat->omask;
/* init. scan */
data = pseq->data + begin;
stkpos = pseq->hitpos + patnum;
stkerr = pseq->hiterr + patnum;
EmptyStacki(stkpos[0]);
EmptyStacki(stkerr[0]);
/* loop on text data */
for (pos = begin ; pos < end ; pos++) {
sindx = ppat->smat[*data++];
for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) {
pr[2] = pr[3] | smask;
pr[3] = (( pr[0] /* ins */
| (pr[0] >> 1) /* sub */
| (pr[1] >> 1)) /* del */
& cmask)
| ((pr[2] >> 1) & sindx); /* ident */
if (pr[3] & 0x1L) { /* found */
if (! found) {
PUSH(stkpos, pos - ppat->patlen + 1);
PUSH(stkerr, e);
}
found++;
}
}
}
return (*stkpos)->top; /* aka # of hits */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm */
/* API call to previous functions */
/* -------------------------------------------- */
int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
if (ppat->maxerr == 0){
return ManberNoErr(pseq, ppat, patnum, begin, length);}
else if (ppat->hasIndel) {
return ManberIndel(pseq, ppat, patnum, begin, length);
}
else {
return ManberSub(pseq, ppat, patnum, begin, length);
}
}
/* -------------------------------------------- */
/* Alignement NWS */
/* pour edition des hits */
/* (avec substitution obligatoire aux bords) */
/* -------------------------------------------- */
int32_t NwsPatAlign(Seq *pseq, Pattern *ppat,
int32_t nerr, int32_t begin,
int32_t *reslen, int32_t *reserr) {
uint8_t *sseq, *px;
int32_t i, j, lseq, lpat, npos, dindel, dsub,
*pc, *pi, *pd, *ps;
uint32_t amask;
static int32_t sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)];
lpat = ppat->patlen;
lseq = MIN(lpat + MAX_PAT_ERR+1, pseq->seqlen - begin);
sseq = pseq->data + begin - 1;
pc = sTab; /* |----|----| --> i */
pi = pc - 1; /* | ps | pd | | */
pd = pi - lseq; /* |----|----| | */
ps = pd - 1; /* | pi | pc | v j */
/* |---------| */
//amask = ONEMASK >> lpat;
amask = 0x1L << (ppat->patlen);
for (j = 0 ; j <= lpat ; j++) {
for (i = 0 , px = sseq ; i <= lseq ; i++, px++) {
if (i && j) {
dindel = MIN(*pi, *pd) + 1;
if (j == lpat) dindel--;
dsub = *ps + KRONECK(ppat->smat[*px], amask);
// fprintf(stderr, "mismatch : %d %d %d %d\n",j,*px,KRONECK(ppat->smat[*px], amask),amask);
*pc = MIN(dindel, dsub);
}
else if (i) /* j == 0 */
*pc = *pi + 1;
else if (j) /* i == 0 */
*pc = *pd;
else /* root */
*pc = 0;
fprintf(stderr," %02d",*pc);
pc++;
pi++;
pd++;
ps++;
}
fprintf(stderr,"\n");
// amask <<= 1;
amask >>= 1;
}
pc--;
for (i = lseq, npos = 0 ; i >= 0 ; i--, pc--) {
if (*pc <= nerr) {
*reslen++ = i;
*reserr++ = *pc;
npos++;
}
fprintf(stderr,"i=%d *pc = %d<%d, reserr = %d npos = %d\n",i,*pc,nerr,*(reserr-1),npos);
}
return npos;
}