Add the possibility to look for patterns allowing indels.

Former-commit-id: 0599c2b0ad16df086dbdb08e491503870d8904be
This commit is contained in:
2023-03-20 15:28:24 +07:00
parent 5fbe52368c
commit 27d6c60e25
14 changed files with 674 additions and 219 deletions

View File

@ -1,9 +1,9 @@
package obialign package obialign
import ( // import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" // "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" // "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
) // )
const wsize = 16 const wsize = 16
const dwsize = wsize * 2 const dwsize = wsize * 2
@ -58,182 +58,182 @@ var _empty = encodeValues(0, 0, false)
var _out = encodeValues(0, 30000, true) var _out = encodeValues(0, 30000, true)
var _notavail = encodeValues(0, 30000, false) var _notavail = encodeValues(0, 30000, false)
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { // func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
lA := seqA.Len() // lA := seqA.Len()
lB := seqB.Len() // lB := seqB.Len()
// Ensure that A is the longest // // Ensure that A is the longest
if lA < lB { // if lA < lB {
seqA, seqB = seqB, seqA // seqA, seqB = seqB, seqA
lA, lB = lB, lA // lA, lB = lB, lA
} // }
if maxError == -1 { // if maxError == -1 {
maxError = lA * 2 // maxError = lA * 2
} // }
delta := lA - lB // delta := lA - lB
// The difference of length is larger the maximum allowed errors // // The difference of length is larger the maximum allowed errors
if delta > maxError { // if delta > maxError {
return -1, -1 // return -1, -1
} // }
// Doit-on vraiment diviser par deux ??? pas certain // // Doit-on vraiment diviser par deux ??? pas certain
extra := (maxError - delta) + 1 // extra := (maxError - delta) + 1
even := 1 + delta + 2*extra // even := 1 + delta + 2*extra
width := 2*even - 1 // width := 2*even - 1
if buffer == nil { // if buffer == nil {
var local []uint64 // var local []uint64
buffer = &local // buffer = &local
} // }
if cap(*buffer) < 2*width { // if cap(*buffer) < 2*width {
*buffer = make([]uint64, 3*width) // *buffer = make([]uint64, 3*width)
} // }
previous := (*buffer)[0:width] // previous := (*buffer)[0:width]
current := (*buffer)[width:(2 * width)] // current := (*buffer)[width:(2 * width)]
previous[extra] = _empty // previous[extra] = _empty
previous[extra+even] = encodeValues(0, 1, false) // previous[extra+even] = encodeValues(0, 1, false)
previous[extra+even-1] = encodeValues(0, 1, false) // previous[extra+even-1] = encodeValues(0, 1, false)
N := lB + ((delta) >> 1) // N := lB + ((delta) >> 1)
bA := seqA.Sequence() // bA := seqA.Sequence()
bB := seqB.Sequence() // bB := seqB.Sequence()
// log.Println("N = ", N) // // log.Println("N = ", N)
for y := 1; y <= N; y++ { // for y := 1; y <= N; y++ {
// in_matrix := false // // in_matrix := false
x1 := y - lB + extra // x1 := y - lB + extra
x2 := extra - y // x2 := extra - y
xs := goutils.MaxInt(goutils.MaxInt(x1, x2), 0) // xs := goutils.MaxInt(goutils.MaxInt(x1, x2), 0)
x1 = y + extra // x1 = y + extra
x2 = lA + extra - y // x2 = lA + extra - y
xf := goutils.MinInt(goutils.MinInt(x1, x2), even-1) + 1 // xf := goutils.MinInt(goutils.MinInt(x1, x2), even-1) + 1
for x := xs; x < xf; x++ { // for x := xs; x < xf; x++ {
i := y - x + extra // i := y - x + extra
j := y + x - extra // j := y + x - extra
var Sdiag, Sleft, Sup uint64 // var Sdiag, Sleft, Sup uint64
switch { // switch {
case i == 0: // case i == 0:
Sup = _notavail // Sup = _notavail
Sdiag = _notavail // Sdiag = _notavail
Sleft = encodeValues(0, j-1, false) // Sleft = encodeValues(0, j-1, false)
case j == 0: // case j == 0:
Sup = encodeValues(0, i-1, false) // Sup = encodeValues(0, i-1, false)
Sdiag = _notavail // Sdiag = _notavail
Sleft = _notavail // Sleft = _notavail
default: // default:
Sdiag = previous[x] // Sdiag = previous[x]
if bA[j-1] == bB[i-1] { // if bA[j-1] == bB[i-1] {
Sdiag = _incscore(Sdiag) // Sdiag = _incscore(Sdiag)
} // }
if x < (even - 1) { // if x < (even - 1) {
Sup = previous[x+even] // Sup = previous[x+even]
} else { // } else {
Sup = _out // Sup = _out
} // }
if x > 0 { // if x > 0 {
Sleft = previous[x+even-1] // Sleft = previous[x+even-1]
} else { // } else {
Sleft = _out // Sleft = _out
} // }
} // }
var score uint64 // var score uint64
switch { // switch {
case Sdiag >= Sup && Sdiag >= Sleft: // case Sdiag >= Sup && Sdiag >= Sleft:
score = Sdiag // score = Sdiag
case Sup >= Sleft: // case Sup >= Sleft:
score = Sup // score = Sup
default: // default:
score = Sleft // score = Sleft
} // }
if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { // if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) {
score = _setout(score) // score = _setout(score)
} // }
current[x] = _incpath(score) // current[x] = _incpath(score)
} // }
// . 9 10 + 2 - 1 // // . 9 10 + 2 - 1
x1 = y - lB + extra + even // x1 = y - lB + extra + even
x2 = extra - y + even - 1 // x2 = extra - y + even - 1
xs = goutils.MaxInt(goutils.MaxInt(x1, x2), even) // xs = goutils.MaxInt(goutils.MaxInt(x1, x2), even)
x1 = y + extra + even // x1 = y + extra + even
x2 = lA + extra - y + even - 1 // x2 = lA + extra - y + even - 1
xf = goutils.MinInt(goutils.MinInt(x1, x2), width-1) + 1 // xf = goutils.MinInt(goutils.MinInt(x1, x2), width-1) + 1
for x := xs; x < xf; x++ { // for x := xs; x < xf; x++ {
i := y - x + extra + even // i := y - x + extra + even
j := y + x - extra - even + 1 // j := y + x - extra - even + 1
var Sdiag, Sleft, Sup uint64 // var Sdiag, Sleft, Sup uint64
switch { // switch {
case i == 0: // case i == 0:
Sup = _notavail // Sup = _notavail
Sdiag = _notavail // Sdiag = _notavail
Sleft = encodeValues(0, j-1, false) // Sleft = encodeValues(0, j-1, false)
case j == 0: // case j == 0:
Sup = encodeValues(0, i-1, false) // Sup = encodeValues(0, i-1, false)
Sdiag = _notavail // Sdiag = _notavail
Sleft = _notavail // Sleft = _notavail
default: // default:
Sdiag = previous[x] // Sdiag = previous[x]
if bA[j-1] == bB[i-1] { // if bA[j-1] == bB[i-1] {
Sdiag = _incscore(Sdiag) // Sdiag = _incscore(Sdiag)
} // }
Sleft = current[x-even] // Sleft = current[x-even]
Sup = current[x-even+1] // Sup = current[x-even+1]
} // }
var score uint64 // var score uint64
switch { // switch {
case Sdiag >= Sup && Sdiag >= Sleft: // case Sdiag >= Sup && Sdiag >= Sleft:
score = Sdiag // score = Sdiag
case Sup >= Sleft: // case Sup >= Sleft:
score = Sup // score = Sup
default: // default:
score = Sleft // score = Sleft
} // }
if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { // if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) {
score = _setout(score) // score = _setout(score)
} // }
current[x] = _incpath(score) // current[x] = _incpath(score)
} // }
previous, current = current, previous // previous, current = current, previous
} // }
s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)]) // s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)])
if o { // if o {
return -1, -1 // return -1, -1
} // }
return s, l // return s, l
} // }

319
pkg/obialign/fastlcsegf.go Normal file
View File

@ -0,0 +1,319 @@
package obialign
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
log "github.com/sirupsen/logrus"
)
var _iupac = [26]byte{
// a b c d e f
1, 14, 2, 13, 0, 0,
// g h i j k l
4, 11, 0, 0, 12, 0,
// m n o p q r
3, 15, 0, 0, 0, 5,
// s t u v w x
6, 8, 8, 13, 9, 0,
// y z
10, 0,
}
func _samenuc(a, b byte) bool {
if (a >= 'A') && (a <= 'Z') {
a |= 32
}
if (b >= 'A') && (b <= 'Z') {
b |= 32
}
if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') {
return (_iupac[a-'a'] & _iupac[b-'a']) > 0
}
return a == b
}
func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {
lA := len(bA)
lB := len(bB)
// Ensure that A is the longest
if lA < lB {
bA, bB = bB, bA
lA, lB = lB, lA
}
if maxError == -1 {
maxError = lA * 2
}
delta := lA - lB
// The difference of length is larger the maximum allowed errors
if delta > maxError {
return -1, -1
}
// // BEGINNING OF DEBUG CODE //
// debug_scores := make([][]int, lB+1)
// for i := range debug_scores {
// debug_scores[i] = make([]int, lA+1)
// }
// debug_path := make([][]int, lB+1)
// for i := range debug_path {
// debug_path[i] = make([]int, lA+1)
// }
// debug_out := make([][]bool, lB+1)
// for i := range debug_out {
// debug_out[i] = make([]bool, lA+1)
// }
// // END OF DEBUG CODE //
// Doit-on vraiment diviser par deux ??? pas certain
extra := (maxError - delta) + 1
even := 1 + delta + 2*extra
width := 2*even - 1
if buffer == nil {
var local []uint64
buffer = &local
}
if cap(*buffer) < 2*width {
*buffer = make([]uint64, 3*width)
}
previous := (*buffer)[0:width]
current := (*buffer)[width:(2 * width)]
previous[extra] = _empty // Initialise cell 0,0
if endgapfree { // Initialise cell 0,1
previous[extra+even] = encodeValues(0, 0, false)
} else {
previous[extra+even] = encodeValues(0, 1, false)
}
previous[extra+even-1] = encodeValues(0, 1, false) // Initialise cell 1,0
N := lB + ((delta) >> 1)
log.Debugln("N = ", N, " delta = ", delta, " extra = ", extra, " maxError = ", maxError)
for y := 1; y <= N; y++ {
// in_matrix := false
x1 := y - lB + extra
x2 := extra - y
xs := goutils.MaxInt(goutils.MaxInt(x1, x2), 0)
x1 = y + extra
x2 = lA + extra - y
xf := goutils.MinInt(goutils.MinInt(x1, x2), even-1) + 1
for x := xs; x < xf; x++ {
// i span along B and j along A
i := y - x + extra
j := y + x - extra
// log.Debugln("Coord : ", i, j)
var Sdiag, Sleft, Sup uint64
switch {
case i == 0:
Sup = _notavail
Sdiag = _notavail
if endgapfree {
Sleft = encodeValues(0, 0, false)
} else {
Sleft = encodeValues(0, j, false)
}
case j == 0:
Sup = encodeValues(0, i, false)
Sdiag = _notavail
Sleft = _notavail
default:
Sdiag = _incpath(previous[x])
if _samenuc(bA[j-1], bB[i-1]) {
Sdiag = _incscore(Sdiag)
}
if x < (even - 1) {
Sup = _incpath(previous[x+even])
} else {
Sup = _out
}
if x > 0 {
Sleft = previous[x+even-1]
if (i > 0 && i < lB) || !endgapfree {
Sleft = _incpath(Sleft)
}
} else {
Sleft = _out
}
}
var score uint64
switch {
case Sdiag >= Sup && Sdiag >= Sleft:
score = Sdiag
case Sup >= Sleft:
score = Sup
default:
score = Sleft
}
// I supose the bug was here
// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) {
// score = _setout(score)
// }
// // BEGINNING OF DEBUG CODE //
// if i < 2 && j < 5 {
// log.Debugf("[%d,%d]\n",i,j)
// s, p, o := decodeValues(Sdiag)
// log.Debugf("+Sdiag (%v) : %d, %d, %v\n",Sdiag,s,p,o)
// s, p, o = decodeValues(Sup)
// log.Debugf("+Sup (%v) : %d, %d, %v\n",Sup,s,p,o)
// s, p, o = decodeValues(Sleft)
// log.Debugf("+Sleft (%v) : %d, %d, %v\n",Sleft,s,p,o)
// s, p, o = decodeValues(score)
// log.Debugf("+score (%v) : %d, %d, %v\n",score,s,p,o)
// log.Debugln("-------------------")
// }
// s, p, o := decodeValues(score)
// debug_scores[i][j] = s
// debug_path[i][j] = p
// debug_out[i][j] = o
// // END OF DEBUG CODE //
current[x] = score
}
// . 9 10 + 2 - 1
x1 = y - lB + extra + even
x2 = extra - y + even - 1
xs = goutils.MaxInt(goutils.MaxInt(x1, x2), even)
x1 = y + extra + even
x2 = lA + extra - y + even - 1
xf = goutils.MinInt(goutils.MinInt(x1, x2), width-1) + 1
for x := xs; x < xf; x++ {
i := y - x + extra + even
j := y + x - extra - even + 1
//log.Debugln("Coord : ", i, j)
var Sdiag, Sleft, Sup uint64
switch {
case i == 0:
Sup = _notavail
Sdiag = _notavail
if endgapfree {
Sleft = encodeValues(0, 0, false)
} else {
Sleft = encodeValues(0, j, false)
}
case j == 0:
Sup = encodeValues(0, i, false)
Sdiag = _notavail
Sleft = _notavail
default:
Sdiag = _incpath(previous[x])
if _samenuc(bA[j-1], bB[i-1]) {
Sdiag = _incscore(Sdiag)
}
Sleft = current[x-even]
if (i > 0 && i < lB) || !endgapfree {
Sleft = _incpath(Sleft)
}
Sup = _incpath(current[x-even+1])
}
var score uint64
switch {
case Sdiag >= Sup && Sdiag >= Sleft:
score = Sdiag
case Sup >= Sleft:
score = Sup
default:
score = Sleft
}
// I supose the bug was here
// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) {
// score = _setout(score)
// }
// // BEGINNING OF DEBUG CODE //
// if i < 2 && j < 5 {
// log.Debugf("[%d,%d]\n",i,j)
// s, p, o := decodeValues(Sdiag)
// log.Debugf("-Sdiag (%v) : %d, %d, %v\n",Sdiag,s,p,o)
// s, p, o = decodeValues(Sup)
// log.Debugf("-Sup (%v) : %d, %d, %v\n",Sup,s,p,o)
// s, p, o = decodeValues(Sleft)
// log.Debugf("-Sleft (%v) : %d, %d, %v\n",Sleft,s,p,o)
// s, p, o = decodeValues(score)
// log.Debugf("-score (%v) : %d, %d, %v\n",score,s,p,o)
// log.Debugln("-------------------")
// }
// s, p, o := decodeValues(score)
// debug_scores[i][j] = s
// debug_path[i][j] = p
// debug_out[i][j] = o
// // END OF DEBUG CODE //
current[x] = score
}
previous, current = current, previous
}
s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)])
// // BEGINNING OF DEBUG CODE //
// fmt.Printf("%2c\t", ' ')
// for j := 0; j <= lA; j++ {
// if j > 0 {
// fmt.Printf("%11c\t", bA[j-1])
// } else {
// fmt.Printf("%11c\t", '-')
// }
// }
// fmt.Printf("\n")
// for i := 0; i <= lB; i++ {
// if i > 0 {
// fmt.Printf("%2c\t", bB[i-1])
// } else {
// fmt.Printf("%2c\t", '-')
// }
// for j := 0; j <= lA; j++ {
// fmt.Printf("%2d:%2d:%v\t", debug_scores[i][j],
// debug_path[i][j], debug_out[i][j])
// }
// fmt.Printf("\n")
// }
// // end OF DEBUG CODE //
if o {
return -1, -1
}
return s, l
}
func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
}
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
}

View File

@ -151,10 +151,7 @@ int32_t ManberNoErr (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t beg
int32_t ManberSub (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length); int32_t ManberSub (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberIndel (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length); int32_t ManberIndel (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberAll (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length); int32_t ManberAll (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t NwsPatAlign (Seq *pseq , Pattern *ppat, int32_t nerr , int32_t NwsPatAlign (Seq *pseq , Pattern *ppat, int32_t nerr, int32_t begin, int32_t *reslen, int32_t *reserr); /* apat_sys.c */
int32_t *reslen , int32_t *reserr);
/* apat_sys.c */
float UserCpuTime (int32_t reset); float UserCpuTime (int32_t reset);
float SysCpuTime (int32_t reset); float SysCpuTime (int32_t reset);

View File

@ -23,7 +23,8 @@
#define TOPCURS CursiToTop #define TOPCURS CursiToTop
#define DOWNREAD ReadiDown #define DOWNREAD ReadiDown
#define KRONECK(x, msk) ((~x & msk) ? 0 : 1) //#define KRONECK(x, msk) ((~x & msk) ? 0 : 1)
#define KRONECK(x, msk) ((x & msk) ? 0 : 1)
#define MIN(x, y) ((x) < (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y))
/* -------------------------------------------- */ /* -------------------------------------------- */
@ -192,8 +193,8 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{ {
int e, emax, found; int e, emax, found;
uint32_t pos; uint32_t pos;
uint32_t smask, cmask, sindx; patword_t smask, cmask, sindx;
uint32_t *pr, r[2 * MAX_PAT_ERR + 2]; patword_t *pr, r[2 * MAX_PAT_ERR + 2];
uint8_t *data; uint8_t *data;
StackiPtr *stkpos, *stkerr; StackiPtr *stkpos, *stkerr;
uint32_t end; uint32_t end;
@ -220,6 +221,9 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
stkpos = pseq->hitpos + patnum; stkpos = pseq->hitpos + patnum;
stkerr = pseq->hiterr + patnum; stkerr = pseq->hiterr + patnum;
EmptyStacki(stkpos[0]);
EmptyStacki(stkerr[0]);
/* loop on text data */ /* loop on text data */
for (pos = begin ; pos < end ; pos++) { for (pos = begin ; pos < end ; pos++) {
@ -256,12 +260,14 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
/* -------------------------------------------- */ /* -------------------------------------------- */
int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{ {
if (ppat->maxerr == 0) if (ppat->maxerr == 0){
return ManberNoErr(pseq, ppat, patnum, begin, length); return ManberNoErr(pseq, ppat, patnum, begin, length);}
else if (ppat->hasIndel) else if (ppat->hasIndel) {
return ManberIndel(pseq, ppat, patnum, begin, length); return ManberIndel(pseq, ppat, patnum, begin, length);
else }
else {
return ManberSub(pseq, ppat, patnum, begin, length); return ManberSub(pseq, ppat, patnum, begin, length);
}
} }
@ -271,11 +277,9 @@ int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
/* (avec substitution obligatoire aux bords) */ /* (avec substitution obligatoire aux bords) */
/* -------------------------------------------- */ /* -------------------------------------------- */
int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr) int32_t NwsPatAlign(Seq *pseq, Pattern *ppat,
Seq *pseq; int32_t nerr, int32_t begin,
Pattern *ppat; int32_t *reslen, int32_t *reserr) {
int32_t nerr, *reslen, *reserr;
{
uint8_t *sseq, *px; uint8_t *sseq, *px;
int32_t i, j, lseq, lpat, npos, dindel, dsub, int32_t i, j, lseq, lpat, npos, dindel, dsub,
*pc, *pi, *pd, *ps; *pc, *pi, *pd, *ps;
@ -283,7 +287,9 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
static int32_t sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)]; static int32_t sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)];
lseq = pseq->seqlen; lpat = ppat->patlen;
lseq = MIN(lpat + MAX_PAT_ERR+1, pseq->seqlen - begin);
sseq = pseq->data + begin - 1;
pc = sTab; /* |----|----| --> i */ pc = sTab; /* |----|----| --> i */
pi = pc - 1; /* | ps | pd | | */ pi = pc - 1; /* | ps | pd | | */
@ -291,36 +297,39 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
ps = pd - 1; /* | pi | pc | v j */ ps = pd - 1; /* | pi | pc | v j */
/* |---------| */ /* |---------| */
lseq = pseq->seqlen;
lpat = ppat->patlen;
sseq = pseq->data - 1;
amask = ONEMASK >> lpat; //amask = ONEMASK >> lpat;
amask = 0x1L << (ppat->patlen);
for (j = 0 ; j <= lpat ; j++) { for (j = 0 ; j <= lpat ; j++) {
for (i = 0 , px = sseq ; i <= lseq ; i++, px++) { for (i = 0 , px = sseq ; i <= lseq ; i++, px++) {
if (i && j) { if (i && j) {
dindel = MIN(*pi, *pd) + 1; dindel = MIN(*pi, *pd) + 1;
if (j == lpat) dindel--;
dsub = *ps + KRONECK(ppat->smat[*px], amask); dsub = *ps + KRONECK(ppat->smat[*px], amask);
// fprintf(stderr, "mismatch : %d %d %d %d\n",j,*px,KRONECK(ppat->smat[*px], amask),amask);
*pc = MIN(dindel, dsub); *pc = MIN(dindel, dsub);
} }
else if (i) /* j == 0 */ else if (i) /* j == 0 */
*pc = *pi + 1; *pc = *pi + 1;
else if (j) /* i == 0 */ else if (j) /* i == 0 */
*pc = *pd + 1; *pc = *pd;
else /* root */ else /* root */
*pc = 0; *pc = 0;
fprintf(stderr," %02d",*pc);
pc++; pc++;
pi++; pi++;
pd++; pd++;
ps++; ps++;
} }
fprintf(stderr,"\n");
amask <<= 1; // amask <<= 1;
amask >>= 1;
} }
pc--; pc--;
@ -331,6 +340,7 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
*reserr++ = *pc; *reserr++ = *pc;
npos++; npos++;
} }
fprintf(stderr,"i=%d *pc = %d<%d, reserr = %d npos = %d\n",i,*pc,nerr,*(reserr-1),npos);
} }
return npos; return npos;

View File

@ -337,7 +337,7 @@ int32_t delete_apatseq(SeqPtr pseq,
return 1; return 1;
} }
PatternPtr buildPattern(const char *pat, int32_t error_max, PatternPtr buildPattern(const char *pat, int32_t error_max, uint8_t hasIndel,
int *errno, char **errmsg) int *errno, char **errmsg)
{ {
PatternPtr pattern; PatternPtr pattern;
@ -355,7 +355,7 @@ PatternPtr buildPattern(const char *pat, int32_t error_max,
errno,errmsg); errno,errmsg);
pattern->ok = true; pattern->ok = true;
pattern->hasIndel= false; pattern->hasIndel= hasIndel;
pattern->maxerr = error_max; pattern->maxerr = error_max;
pattern->cpat = (char*)pattern + sizeof(Pattern); pattern->cpat = (char*)pattern + sizeof(Pattern);

View File

@ -118,7 +118,7 @@ ecoseq_t *new_ecoseq_with_data( char *AC,
int32_t delete_apatseq(SeqPtr pseq, int32_t delete_apatseq(SeqPtr pseq,
int *errno, char **errmsg); int *errno, char **errmsg);
PatternPtr buildPattern(const char *pat, int32_t error_max, int *errno, char **errmsg); PatternPtr buildPattern(const char *pat, int32_t error_max, uint8_t hasIndel, int *errno, char **errmsg);
PatternPtr complementPattern(PatternPtr pat, int *errno, char **errmsg); PatternPtr complementPattern(PatternPtr pat, int *errno, char **errmsg);
SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen, SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,

View File

@ -13,6 +13,8 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
) )
@ -65,14 +67,20 @@ var NilApatSequence = ApatSequence{nil}
// the errormax parameter. Some positions can be marked as not // the errormax parameter. Some positions can be marked as not
// allowed for mismatches. They have to be signaled using a '#' // allowed for mismatches. They have to be signaled using a '#'
// sign after the corresponding nucleotide. // sign after the corresponding nucleotide.
func MakeApatPattern(pattern string, errormax int) (ApatPattern, error) { func MakeApatPattern(pattern string, errormax int, allowsIndel bool) (ApatPattern, error) {
cpattern := C.CString(pattern) cpattern := C.CString(pattern)
defer C.free(unsafe.Pointer(cpattern)) defer C.free(unsafe.Pointer(cpattern))
cerrormax := C.int32_t(errormax) cerrormax := C.int32_t(errormax)
callosindel := C.uint8_t(0)
if allowsIndel {
callosindel = C.uint8_t(1)
}
var errno C.int32_t var errno C.int32_t
var errmsg *C.char var errmsg *C.char
apc := C.buildPattern(cpattern, cerrormax, &errno, &errmsg) apc := C.buildPattern(cpattern, cerrormax, callosindel, &errno, &errmsg)
if apc == nil { if apc == nil {
message := C.GoString(errmsg) message := C.GoString(errmsg)
@ -281,16 +289,13 @@ func (sequence ApatSequence) Free() {
// values of the [3]int indicate respectively the start and the end position of // values of the [3]int indicate respectively the start and the end position of
// the match. Following the GO convention the end position is not included in the // the match. Following the GO convention the end position is not included in the
// match. The third value indicates the number of error detected for this occurrence. // match. The third value indicates the number of error detected for this occurrence.
func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (loc [][3]int) { func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int) (loc [][3]int) {
begin := 0 if begin < 0 {
length := sequence.Len() begin = 0
if len(limits) > 0 {
begin = limits[0]
} }
if len(limits) > 1 { if length < 0 {
length = limits[1] length = sequence.Len()
} }
nhits := int(C.ManberAll(sequence.pointer.pointer, nhits := int(C.ManberAll(sequence.pointer.pointer,
@ -310,13 +315,70 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (l
for i := 0; i < nhits; i++ { for i := 0; i < nhits; i++ {
start := int(stktmp[i]) start := int(stktmp[i])
err := int(errtmp[i]) err := int(errtmp[i])
log.Debugln(C.GoString(pattern.pointer.pointer.cpat), start, err)
loc = append(loc, [3]int{start, start + patlen, err}) loc = append(loc, [3]int{start, start + patlen, err})
} }
log.Debugln("------------")
return loc return loc
} }
func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (start int, end int, nerr int, matched bool) {
res := pattern.FindAllIndex(sequence, begin, length)
sbuffer := [(int(C.MAX_PAT_LEN) + int(C.MAX_PAT_ERR) + 1) * (int(C.MAX_PAT_LEN) + 1)]uint64{}
buffer := sbuffer[:]
if len(res) == 0 {
matched = false
return
}
matched = true
best := [3]int{0, 0, 10000}
for _, m := range res {
if m[2] < best[2] {
best = m
log.Debugln(best)
}
}
nerr = best[2]
end = best[1]
if nerr == 0 || !pattern.pointer.pointer.hasIndel {
start = best[0]
log.Debugln("No nws ", start, nerr)
return
}
start = best[0] - nerr
end = best[0] + int(pattern.pointer.pointer.patlen) + nerr
start = goutils.MaxInt(start, 0)
end = goutils.MinInt(end, sequence.Len())
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
cseq := (*[1 << 30]byte)(unsafe.Pointer(sequence.pointer.pointer.cseq))
log.Debugln(
string((*cseq)[start:end]),
string((*cpattern)[0:int(pattern.pointer.pointer.patlen)]),
best[0], nerr, int(pattern.pointer.pointer.patlen),
sequence.Len(), start, end)
score, lali := obialign.FastLCSEGFScoreByte(
(*cseq)[start:end],
(*cpattern)[0:int(pattern.pointer.pointer.patlen)],
nerr*2, true, &buffer)
nerr = lali - score
start = best[0] + int(pattern.pointer.pointer.patlen) - lali
log.Println("results", score, lali, start, nerr)
return
}
// tagaacaggctcctctag
// func AllocatedApaSequences() int { // func AllocatedApaSequences() int {
// return int(_AllocatedApaSequences) // return int(_AllocatedApaSequences)
// } // }

View File

@ -133,7 +133,7 @@ func OptionForwardPrimer(primer string, max int) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
var err error var err error
opt.pointer.forward, err = MakeApatPattern(primer, max) opt.pointer.forward, err = MakeApatPattern(primer, max, false)
if err != nil { if err != nil {
log.Fatalf("error : %v\n", err) log.Fatalf("error : %v\n", err)
} }
@ -155,7 +155,7 @@ func OptionReversePrimer(primer string, max int) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
var err error var err error
opt.pointer.reverse, err = MakeApatPattern(primer, max) opt.pointer.reverse, err = MakeApatPattern(primer, max, false)
if err != nil { if err != nil {
log.Fatalf("error : %v\n", err) log.Fatalf("error : %v\n", err)
} }
@ -210,7 +210,7 @@ func _Pcr(seq ApatSequence,
reverse := opt.pointer.reverse reverse := opt.pointer.reverse
crev := opt.pointer.crev crev := opt.pointer.crev
forwardMatches := forward.FindAllIndex(seq) forwardMatches := forward.FindAllIndex(seq,0,-1)
if len(forwardMatches) > 0 { if len(forwardMatches) > 0 {
@ -284,7 +284,7 @@ func _Pcr(seq ApatSequence,
} }
} }
forwardMatches = reverse.FindAllIndex(seq) forwardMatches = reverse.FindAllIndex(seq,0,-1)
if forwardMatches != nil { if forwardMatches != nil {
begin := forwardMatches[0][0] begin := forwardMatches[0][0]

View File

@ -27,11 +27,11 @@ type DemultiplexMatch struct {
Error error Error error
} }
func (library *NGSLibrary) Compile(maxError int) error { func (library *NGSLibrary) Compile(maxError int, allowsIndel bool) error {
for primers, marker := range *library { for primers, marker := range *library {
err := marker.Compile(primers.Forward, err := marker.Compile(primers.Forward,
primers.Reverse, primers.Reverse,
maxError) maxError, allowsIndel)
if err != nil { if err != nil {
return err return err
} }
@ -57,15 +57,13 @@ func (library *NGSLibrary) ExtractBarcode(sequence *obiseq.BioSequence, inplace
return match.ExtractBarcode(sequence, inplace) return match.ExtractBarcode(sequence, inplace)
} }
func (marker *Marker) Compile(forward, reverse string, maxError int) error { func (marker *Marker) Compile(forward, reverse string, maxError int, allowsIndel bool) error {
var err error var err error
marker.forward, err = obiapat.MakeApatPattern(forward, marker.forward, err = obiapat.MakeApatPattern(forward, maxError, allowsIndel)
maxError)
if err != nil { if err != nil {
return err return err
} }
marker.reverse, err = obiapat.MakeApatPattern(reverse, marker.reverse, err = obiapat.MakeApatPattern(reverse, maxError, allowsIndel)
maxError)
if err != nil { if err != nil {
return err return err
} }
@ -105,33 +103,34 @@ func (marker *Marker) Compile(forward, reverse string, maxError int) error {
return nil return nil
} }
func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
aseq, _ := obiapat.MakeApatSequence(sequence, false) aseq, _ := obiapat.MakeApatSequence(sequence, false)
match := marker.forward.FindAllIndex(aseq, marker.taglength) start, end, nerr ,matched := marker.forward.BestMatch(aseq, marker.taglength,-1)
if matched {
if len(match) > 0 {
sseq := sequence.String() sseq := sequence.String()
direct := sseq[match[0][0]:match[0][1]] direct := sseq[start:end]
ftag := strings.ToLower(sseq[(match[0][0] - marker.taglength):match[0][0]]) ftag := strings.ToLower(sseq[(start - marker.taglength):start])
m := DemultiplexMatch{ m := DemultiplexMatch{
ForwardMatch: direct, ForwardMatch: direct,
ForwardTag: ftag, ForwardTag: ftag,
BarcodeStart: match[0][1], BarcodeStart: end,
ForwardMismatches: match[0][2], ForwardMismatches: nerr,
IsDirect: true, IsDirect: true,
Error: nil, Error: nil,
} }
rmatch := marker.creverse.FindAllIndex(aseq, match[0][1]) start, end, nerr ,matched = marker.creverse.BestMatch(aseq, start,-1)
if len(rmatch) > 0 { if matched {
// extracting primer matches // extracting primer matches
reverse, _ := sequence.Subsequence(rmatch[0][0], rmatch[0][1], false) reverse, _ := sequence.Subsequence(start,end, false)
defer reverse.Recycle() defer reverse.Recycle()
reverse = reverse.ReverseComplement(true) reverse = reverse.ReverseComplement(true)
rtag, err := sequence.Subsequence(rmatch[0][1], rmatch[0][1]+marker.taglength, false) rtag, err := sequence.Subsequence(end, end+marker.taglength, false)
defer rtag.Recycle() defer rtag.Recycle()
srtag := "" srtag := ""
@ -143,8 +142,8 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
} }
m.ReverseMatch = strings.ToLower(reverse.String()) m.ReverseMatch = strings.ToLower(reverse.String())
m.ReverseMismatches = rmatch[0][2] m.ReverseMismatches = nerr
m.BarcodeEnd = rmatch[0][0] m.BarcodeEnd = start
m.ReverseTag = srtag m.ReverseTag = srtag
sample, ok := marker.samples[TagPair{ftag, srtag}] sample, ok := marker.samples[TagPair{ftag, srtag}]
@ -162,32 +161,32 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
return &m return &m
} }
match = marker.reverse.FindAllIndex(aseq, marker.taglength) start, end, nerr ,matched = marker.reverse.BestMatch(aseq, marker.taglength,-1)
if len(match) > 0 { if matched {
sseq := sequence.String() sseq := sequence.String()
reverse := strings.ToLower(sseq[match[0][0]:match[0][1]]) reverse := strings.ToLower(sseq[start:end])
rtag := strings.ToLower(sseq[(match[0][0] - marker.taglength):match[0][0]]) rtag := strings.ToLower(sseq[(start - marker.taglength):start])
m := DemultiplexMatch{ m := DemultiplexMatch{
ReverseMatch: reverse, ReverseMatch: reverse,
ReverseTag: rtag, ReverseTag: rtag,
BarcodeStart: match[0][1], BarcodeStart: end,
ReverseMismatches: match[0][2], ReverseMismatches: nerr,
IsDirect: false, IsDirect: false,
Error: nil, Error: nil,
} }
rmatch := marker.cforward.FindAllIndex(aseq, match[0][1]) start, end, nerr ,matched = marker.cforward.BestMatch(aseq, end,-1)
if len(rmatch) > 0 { if matched {
direct, _ := sequence.Subsequence(rmatch[0][0], rmatch[0][1], false) direct, _ := sequence.Subsequence(start,end, false)
defer direct.Recycle() defer direct.Recycle()
direct = direct.ReverseComplement(true) direct = direct.ReverseComplement(true)
ftag, err := sequence.Subsequence(rmatch[0][1], rmatch[0][1]+marker.taglength, false) ftag, err := sequence.Subsequence(end,end+marker.taglength, false)
defer ftag.Recycle() defer ftag.Recycle()
sftag := "" sftag := ""
if err != nil { if err != nil {
@ -200,8 +199,8 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
m.ForwardMatch = direct.String() m.ForwardMatch = direct.String()
m.ForwardTag = sftag m.ForwardTag = sftag
m.ReverseMismatches = rmatch[0][2] m.ReverseMismatches = nerr
m.BarcodeEnd = rmatch[0][0] m.BarcodeEnd = start
sample, ok := marker.samples[TagPair{sftag, rtag}] sample, ok := marker.samples[TagPair{sftag, rtag}]

View File

@ -8,6 +8,7 @@ type _Options struct {
discardErrors bool discardErrors bool
unidentified string unidentified string
allowedMismatch int allowedMismatch int
allowsIndel bool
withProgressBar bool withProgressBar bool
parallelWorkers int parallelWorkers int
batchSize int batchSize int
@ -55,6 +56,14 @@ func OptionAllowedMismatches(count int) WithOption {
return f return f
} }
func OptionAllowedIndel(allowed bool) WithOption {
f := WithOption(func(opt Options) {
opt.pointer.allowsIndel = allowed
})
return f
}
// OptionParallelWorkers sets how many search // OptionParallelWorkers sets how many search
// jobs will be run in parallel. // jobs will be run in parallel.
func OptionParallelWorkers(nworkers int) WithOption { func OptionParallelWorkers(nworkers int) WithOption {
@ -87,6 +96,10 @@ func (options Options) AllowedMismatch() int {
return options.pointer.allowedMismatch return options.pointer.allowedMismatch
} }
func (options Options) AllowsIndel() bool {
return options.pointer.allowsIndel
}
func (options Options) WithProgressBar() bool { func (options Options) WithProgressBar() bool {
return options.pointer.withProgressBar return options.pointer.withProgressBar
} }
@ -110,6 +123,7 @@ func MakeOptions(setters []WithOption) Options {
discardErrors: true, discardErrors: true,
unidentified: "", unidentified: "",
allowedMismatch: 0, allowedMismatch: 0,
allowsIndel: false,
withProgressBar: false, withProgressBar: false,
parallelWorkers: 4, parallelWorkers: 4,
batchSize: 1000, batchSize: 1000,
@ -145,7 +159,7 @@ func ExtractBarcodeSlice(ngslibrary NGSLibrary,
opt := MakeOptions(options) opt := MakeOptions(options)
ngslibrary.Compile(opt.AllowedMismatch()) ngslibrary.Compile(opt.AllowedMismatch(),opt.AllowsIndel())
return _ExtractBarcodeSlice(ngslibrary, sequences, opt) return _ExtractBarcodeSlice(ngslibrary, sequences, opt)
} }
@ -155,7 +169,7 @@ func ExtractBarcodeSliceWorker(ngslibrary NGSLibrary,
opt := MakeOptions(options) opt := MakeOptions(options)
ngslibrary.Compile(opt.AllowedMismatch()) ngslibrary.Compile(opt.AllowedMismatch(),opt.AllowsIndel())
worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice { worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
return _ExtractBarcodeSlice(ngslibrary, sequences, opt) return _ExtractBarcodeSlice(ngslibrary, sequences, opt)

46
pkg/obiseq/compare.go Normal file
View File

@ -0,0 +1,46 @@
package obiseq
import (
"bytes"
)
type Compare func(a, b *BioSequence) int
func CompareSequence(a, b *BioSequence) int {
return bytes.Compare(a.sequence, b.sequence)
}
func CompareQuality(a, b *BioSequence) int {
return bytes.Compare(a.qualities, b.qualities)
}
// func CompareAttributeBuillder(key string) Compare {
// f := func(a, b *BioSequence) int {
// ak, oka := a.GetAttribute(key)
// bk, okb := b.GetAttribute(key)
// switch {
// case !oka && !okb:
// return 0
// case !oka:
// return -1
// case !okb:
// return +1
// }
// //av,oka := ak.(constraints.Ordered)
// //bv,okb := bk.(constraints.Ordered)
// switch {
// case !oka && !okb:
// return 0
// case !oka:
// return -1
// case !okb:
// return +1
// }
// }
// return f
// }

View File

@ -16,6 +16,7 @@ func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error
opts = append(opts, opts = append(opts,
obingslibrary.OptionAllowedMismatches(CLIAllowedMismatch()), obingslibrary.OptionAllowedMismatches(CLIAllowedMismatch()),
obingslibrary.OptionAllowedIndel(CLIAllowsIndel()),
obingslibrary.OptionUnidentified(CLIUnidentifiedFileName()), obingslibrary.OptionUnidentified(CLIUnidentifiedFileName()),
obingslibrary.OptionDiscardErrors(!CLIConservedErrors()), obingslibrary.OptionDiscardErrors(!CLIConservedErrors()),
obingslibrary.OptionParallelWorkers(obioptions.CLIParallelWorkers()), obingslibrary.OptionParallelWorkers(obioptions.CLIParallelWorkers()),

View File

@ -13,6 +13,7 @@ import (
var _NGSFilterFile = "" var _NGSFilterFile = ""
var _UnidentifiedFile = "" var _UnidentifiedFile = ""
var _AllowedMismatch = int(2) var _AllowedMismatch = int(2)
var _AllowsIndel = false
var _ConservedError = false var _ConservedError = false
// PCROptionSet defines every options related to a simulated PCR. // PCROptionSet defines every options related to a simulated PCR.
@ -33,6 +34,9 @@ func MultiplexOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_ConservedError, "keep-errors", _ConservedError, options.BoolVar(&_ConservedError, "keep-errors", _ConservedError,
options.Description("Prints symbol counts.")) options.Description("Prints symbol counts."))
options.BoolVar(&_AllowsIndel, "with-indels", _AllowsIndel,
options.Description("Allows for indels during the primers matching."))
options.StringVar(&_UnidentifiedFile, "unidentified", _UnidentifiedFile, options.StringVar(&_UnidentifiedFile, "unidentified", _UnidentifiedFile,
options.Alias("u"), options.Alias("u"),
options.Description("Filename used to store the sequences unassigned to any sample.")) options.Description("Filename used to store the sequences unassigned to any sample."))
@ -52,6 +56,9 @@ func CLIAllowedMismatch() int {
return _AllowedMismatch return _AllowedMismatch
} }
func CLIAllowsIndel() bool {
return _AllowsIndel
}
func CLIUnidentifiedFileName() string { func CLIUnidentifiedFileName() string {
return _UnidentifiedFile return _UnidentifiedFile
} }

View File

@ -59,7 +59,7 @@ func OptionSet(options *getoptions.GetOpt) {
// ForwardPrimer returns the sequence of the forward primer as indicated by the // ForwardPrimer returns the sequence of the forward primer as indicated by the
// --forward command line option // --forward command line option
func ForwardPrimer() string { func ForwardPrimer() string {
pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch) pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch, false)
if err != nil { if err != nil {
log.Fatalf("%+v", err) log.Fatalf("%+v", err)
@ -73,7 +73,7 @@ func ForwardPrimer() string {
// ReversePrimer returns the sequence of the reverse primer as indicated by the // ReversePrimer returns the sequence of the reverse primer as indicated by the
// --reverse command line option // --reverse command line option
func ReversePrimer() string { func ReversePrimer() string {
pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch) pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch,false)
if err != nil { if err != nil {
log.Fatalf("%+v", err) log.Fatalf("%+v", err)