diff --git a/pkg/obialign/fastlcs.go b/pkg/obialign/fastlcs.go index 50bd27d..54bb1bb 100644 --- a/pkg/obialign/fastlcs.go +++ b/pkg/obialign/fastlcs.go @@ -1,9 +1,9 @@ package obialign -import ( - "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" -) +// import ( +// "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" +// "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" +// ) const wsize = 16 const dwsize = wsize * 2 @@ -58,182 +58,182 @@ var _empty = encodeValues(0, 0, false) var _out = encodeValues(0, 30000, true) var _notavail = encodeValues(0, 30000, false) -func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { +// func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { - lA := seqA.Len() - lB := seqB.Len() +// lA := seqA.Len() +// lB := seqB.Len() - // Ensure that A is the longest - if lA < lB { - seqA, seqB = seqB, seqA - lA, lB = lB, lA - } +// // Ensure that A is the longest +// if lA < lB { +// seqA, seqB = seqB, seqA +// lA, lB = lB, lA +// } - if maxError == -1 { - maxError = lA * 2 - } +// if maxError == -1 { +// maxError = lA * 2 +// } - delta := lA - lB +// delta := lA - lB - // The difference of length is larger the maximum allowed errors - if delta > maxError { - return -1, -1 - } +// // The difference of length is larger the maximum allowed errors +// if delta > maxError { +// return -1, -1 +// } - // Doit-on vraiment diviser par deux ??? pas certain - extra := (maxError - delta) + 1 +// // Doit-on vraiment diviser par deux ??? pas certain +// extra := (maxError - delta) + 1 - even := 1 + delta + 2*extra - width := 2*even - 1 +// even := 1 + delta + 2*extra +// width := 2*even - 1 - if buffer == nil { - var local []uint64 - buffer = &local - } +// if buffer == nil { +// var local []uint64 +// buffer = &local +// } - if cap(*buffer) < 2*width { - *buffer = make([]uint64, 3*width) - } +// if cap(*buffer) < 2*width { +// *buffer = make([]uint64, 3*width) +// } - previous := (*buffer)[0:width] - current := (*buffer)[width:(2 * width)] +// previous := (*buffer)[0:width] +// current := (*buffer)[width:(2 * width)] - previous[extra] = _empty - previous[extra+even] = encodeValues(0, 1, false) - previous[extra+even-1] = encodeValues(0, 1, false) +// previous[extra] = _empty +// previous[extra+even] = encodeValues(0, 1, false) +// previous[extra+even-1] = encodeValues(0, 1, false) - N := lB + ((delta) >> 1) +// N := lB + ((delta) >> 1) - bA := seqA.Sequence() - bB := seqB.Sequence() +// bA := seqA.Sequence() +// bB := seqB.Sequence() - // log.Println("N = ", N) +// // log.Println("N = ", N) - for y := 1; y <= N; y++ { - // in_matrix := false - x1 := y - lB + extra - x2 := extra - y - xs := goutils.MaxInt(goutils.MaxInt(x1, x2), 0) +// for y := 1; y <= N; y++ { +// // in_matrix := false +// x1 := y - lB + extra +// x2 := extra - y +// xs := goutils.MaxInt(goutils.MaxInt(x1, x2), 0) - x1 = y + extra - x2 = lA + extra - y - xf := goutils.MinInt(goutils.MinInt(x1, x2), even-1) + 1 +// x1 = y + extra +// x2 = lA + extra - y +// xf := goutils.MinInt(goutils.MinInt(x1, x2), even-1) + 1 - for x := xs; x < xf; x++ { +// for x := xs; x < xf; x++ { - i := y - x + extra - j := y + x - extra +// i := y - x + extra +// j := y + x - extra - var Sdiag, Sleft, Sup uint64 +// var Sdiag, Sleft, Sup uint64 - switch { +// switch { - case i == 0: - Sup = _notavail - Sdiag = _notavail - Sleft = encodeValues(0, j-1, false) - case j == 0: - Sup = encodeValues(0, i-1, false) - Sdiag = _notavail - Sleft = _notavail - default: - Sdiag = previous[x] +// case i == 0: +// Sup = _notavail +// Sdiag = _notavail +// Sleft = encodeValues(0, j-1, false) +// case j == 0: +// Sup = encodeValues(0, i-1, false) +// Sdiag = _notavail +// Sleft = _notavail +// default: +// Sdiag = previous[x] - if bA[j-1] == bB[i-1] { - Sdiag = _incscore(Sdiag) - } +// if bA[j-1] == bB[i-1] { +// Sdiag = _incscore(Sdiag) +// } - if x < (even - 1) { - Sup = previous[x+even] - } else { - Sup = _out - } - if x > 0 { - Sleft = previous[x+even-1] - } else { - Sleft = _out - } - } +// if x < (even - 1) { +// Sup = previous[x+even] +// } else { +// Sup = _out +// } +// if x > 0 { +// Sleft = previous[x+even-1] +// } else { +// Sleft = _out +// } +// } - var score uint64 - switch { - case Sdiag >= Sup && Sdiag >= Sleft: - score = Sdiag - case Sup >= Sleft: - score = Sup - default: - score = Sleft - } +// var score uint64 +// switch { +// case Sdiag >= Sup && Sdiag >= Sleft: +// score = Sdiag +// case Sup >= Sleft: +// score = Sup +// default: +// score = Sleft +// } - if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { - score = _setout(score) - } +// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { +// score = _setout(score) +// } - current[x] = _incpath(score) - } - // . 9 10 + 2 - 1 - x1 = y - lB + extra + even - x2 = extra - y + even - 1 - xs = goutils.MaxInt(goutils.MaxInt(x1, x2), even) +// current[x] = _incpath(score) +// } +// // . 9 10 + 2 - 1 +// x1 = y - lB + extra + even +// x2 = extra - y + even - 1 +// xs = goutils.MaxInt(goutils.MaxInt(x1, x2), even) - x1 = y + extra + even - x2 = lA + extra - y + even - 1 - xf = goutils.MinInt(goutils.MinInt(x1, x2), width-1) + 1 +// x1 = y + extra + even +// x2 = lA + extra - y + even - 1 +// xf = goutils.MinInt(goutils.MinInt(x1, x2), width-1) + 1 - for x := xs; x < xf; x++ { +// for x := xs; x < xf; x++ { - i := y - x + extra + even - j := y + x - extra - even + 1 +// i := y - x + extra + even +// j := y + x - extra - even + 1 - var Sdiag, Sleft, Sup uint64 +// var Sdiag, Sleft, Sup uint64 - switch { +// switch { - case i == 0: - Sup = _notavail - Sdiag = _notavail - Sleft = encodeValues(0, j-1, false) - case j == 0: - Sup = encodeValues(0, i-1, false) - Sdiag = _notavail - Sleft = _notavail - default: - Sdiag = previous[x] - if bA[j-1] == bB[i-1] { - Sdiag = _incscore(Sdiag) - } +// case i == 0: +// Sup = _notavail +// Sdiag = _notavail +// Sleft = encodeValues(0, j-1, false) +// case j == 0: +// Sup = encodeValues(0, i-1, false) +// Sdiag = _notavail +// Sleft = _notavail +// default: +// Sdiag = previous[x] +// if bA[j-1] == bB[i-1] { +// Sdiag = _incscore(Sdiag) +// } - Sleft = current[x-even] - Sup = current[x-even+1] +// Sleft = current[x-even] +// Sup = current[x-even+1] - } +// } - var score uint64 - switch { - case Sdiag >= Sup && Sdiag >= Sleft: - score = Sdiag - case Sup >= Sleft: - score = Sup - default: - score = Sleft - } +// var score uint64 +// switch { +// case Sdiag >= Sup && Sdiag >= Sleft: +// score = Sdiag +// case Sup >= Sleft: +// score = Sup +// default: +// score = Sleft +// } - if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { - score = _setout(score) - } +// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { +// score = _setout(score) +// } - current[x] = _incpath(score) - } +// current[x] = _incpath(score) +// } - previous, current = current, previous +// previous, current = current, previous - } +// } - s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)]) +// s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)]) - if o { - return -1, -1 - } +// if o { +// return -1, -1 +// } - return s, l -} +// return s, l +// } diff --git a/pkg/obialign/fastlcsegf.go b/pkg/obialign/fastlcsegf.go new file mode 100644 index 0000000..e441134 --- /dev/null +++ b/pkg/obialign/fastlcsegf.go @@ -0,0 +1,319 @@ +package obialign + +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + log "github.com/sirupsen/logrus" +) + +var _iupac = [26]byte{ + // a b c d e f + 1, 14, 2, 13, 0, 0, + // g h i j k l + 4, 11, 0, 0, 12, 0, + // m n o p q r + 3, 15, 0, 0, 0, 5, + // s t u v w x + 6, 8, 8, 13, 9, 0, + // y z + 10, 0, +} + +func _samenuc(a, b byte) bool { + if (a >= 'A') && (a <= 'Z') { + a |= 32 + } + if (b >= 'A') && (b <= 'Z') { + b |= 32 + } + + if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') { + return (_iupac[a-'a'] & _iupac[b-'a']) > 0 + } + return a == b +} +func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) { + + lA := len(bA) + lB := len(bB) + + // Ensure that A is the longest + if lA < lB { + bA, bB = bB, bA + lA, lB = lB, lA + } + + if maxError == -1 { + maxError = lA * 2 + } + + delta := lA - lB + + // The difference of length is larger the maximum allowed errors + if delta > maxError { + return -1, -1 + } + + // // BEGINNING OF DEBUG CODE // + // debug_scores := make([][]int, lB+1) + // for i := range debug_scores { + // debug_scores[i] = make([]int, lA+1) + // } + + // debug_path := make([][]int, lB+1) + // for i := range debug_path { + // debug_path[i] = make([]int, lA+1) + // } + + // debug_out := make([][]bool, lB+1) + // for i := range debug_out { + // debug_out[i] = make([]bool, lA+1) + // } + // // END OF DEBUG CODE // + + // Doit-on vraiment diviser par deux ??? pas certain + extra := (maxError - delta) + 1 + + even := 1 + delta + 2*extra + width := 2*even - 1 + + if buffer == nil { + var local []uint64 + buffer = &local + } + + if cap(*buffer) < 2*width { + *buffer = make([]uint64, 3*width) + } + + previous := (*buffer)[0:width] + current := (*buffer)[width:(2 * width)] + + previous[extra] = _empty // Initialise cell 0,0 + if endgapfree { // Initialise cell 0,1 + previous[extra+even] = encodeValues(0, 0, false) + } else { + previous[extra+even] = encodeValues(0, 1, false) + } + previous[extra+even-1] = encodeValues(0, 1, false) // Initialise cell 1,0 + + N := lB + ((delta) >> 1) + + log.Debugln("N = ", N, " delta = ", delta, " extra = ", extra, " maxError = ", maxError) + + for y := 1; y <= N; y++ { + // in_matrix := false + x1 := y - lB + extra + x2 := extra - y + xs := goutils.MaxInt(goutils.MaxInt(x1, x2), 0) + + x1 = y + extra + x2 = lA + extra - y + xf := goutils.MinInt(goutils.MinInt(x1, x2), even-1) + 1 + + for x := xs; x < xf; x++ { + + // i span along B and j along A + i := y - x + extra + j := y + x - extra + + // log.Debugln("Coord : ", i, j) + var Sdiag, Sleft, Sup uint64 + + switch { + + case i == 0: + Sup = _notavail + Sdiag = _notavail + if endgapfree { + Sleft = encodeValues(0, 0, false) + } else { + Sleft = encodeValues(0, j, false) + } + case j == 0: + Sup = encodeValues(0, i, false) + Sdiag = _notavail + Sleft = _notavail + default: + Sdiag = _incpath(previous[x]) + if _samenuc(bA[j-1], bB[i-1]) { + Sdiag = _incscore(Sdiag) + } + + if x < (even - 1) { + Sup = _incpath(previous[x+even]) + } else { + Sup = _out + } + if x > 0 { + Sleft = previous[x+even-1] + if (i > 0 && i < lB) || !endgapfree { + Sleft = _incpath(Sleft) + } + } else { + Sleft = _out + } + } + + var score uint64 + switch { + case Sdiag >= Sup && Sdiag >= Sleft: + score = Sdiag + case Sup >= Sleft: + score = Sup + default: + score = Sleft + } + + // I supose the bug was here + // if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { + // score = _setout(score) + // } + + // // BEGINNING OF DEBUG CODE // + // if i < 2 && j < 5 { + // log.Debugf("[%d,%d]\n",i,j) + // s, p, o := decodeValues(Sdiag) + // log.Debugf("+Sdiag (%v) : %d, %d, %v\n",Sdiag,s,p,o) + // s, p, o = decodeValues(Sup) + // log.Debugf("+Sup (%v) : %d, %d, %v\n",Sup,s,p,o) + // s, p, o = decodeValues(Sleft) + // log.Debugf("+Sleft (%v) : %d, %d, %v\n",Sleft,s,p,o) + // s, p, o = decodeValues(score) + // log.Debugf("+score (%v) : %d, %d, %v\n",score,s,p,o) + // log.Debugln("-------------------") + // } + // s, p, o := decodeValues(score) + // debug_scores[i][j] = s + // debug_path[i][j] = p + // debug_out[i][j] = o + // // END OF DEBUG CODE // + + current[x] = score + } + // . 9 10 + 2 - 1 + x1 = y - lB + extra + even + x2 = extra - y + even - 1 + xs = goutils.MaxInt(goutils.MaxInt(x1, x2), even) + + x1 = y + extra + even + x2 = lA + extra - y + even - 1 + xf = goutils.MinInt(goutils.MinInt(x1, x2), width-1) + 1 + + for x := xs; x < xf; x++ { + + i := y - x + extra + even + j := y + x - extra - even + 1 + + //log.Debugln("Coord : ", i, j) + var Sdiag, Sleft, Sup uint64 + + switch { + + case i == 0: + Sup = _notavail + Sdiag = _notavail + if endgapfree { + Sleft = encodeValues(0, 0, false) + } else { + Sleft = encodeValues(0, j, false) + } + case j == 0: + Sup = encodeValues(0, i, false) + Sdiag = _notavail + Sleft = _notavail + default: + Sdiag = _incpath(previous[x]) + if _samenuc(bA[j-1], bB[i-1]) { + Sdiag = _incscore(Sdiag) + } + + Sleft = current[x-even] + if (i > 0 && i < lB) || !endgapfree { + Sleft = _incpath(Sleft) + } + Sup = _incpath(current[x-even+1]) + + } + + var score uint64 + switch { + case Sdiag >= Sup && Sdiag >= Sleft: + score = Sdiag + case Sup >= Sleft: + score = Sup + default: + score = Sleft + } + + // I supose the bug was here + // if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { + // score = _setout(score) + // } + + // // BEGINNING OF DEBUG CODE // + // if i < 2 && j < 5 { + // log.Debugf("[%d,%d]\n",i,j) + // s, p, o := decodeValues(Sdiag) + // log.Debugf("-Sdiag (%v) : %d, %d, %v\n",Sdiag,s,p,o) + // s, p, o = decodeValues(Sup) + // log.Debugf("-Sup (%v) : %d, %d, %v\n",Sup,s,p,o) + // s, p, o = decodeValues(Sleft) + // log.Debugf("-Sleft (%v) : %d, %d, %v\n",Sleft,s,p,o) + // s, p, o = decodeValues(score) + // log.Debugf("-score (%v) : %d, %d, %v\n",score,s,p,o) + // log.Debugln("-------------------") + // } + // s, p, o := decodeValues(score) + // debug_scores[i][j] = s + // debug_path[i][j] = p + // debug_out[i][j] = o + // // END OF DEBUG CODE // + current[x] = score + } + + previous, current = current, previous + + } + + s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)]) + + // // BEGINNING OF DEBUG CODE // + // fmt.Printf("%2c\t", ' ') + // for j := 0; j <= lA; j++ { + // if j > 0 { + // fmt.Printf("%11c\t", bA[j-1]) + // } else { + // fmt.Printf("%11c\t", '-') + // } + // } + // fmt.Printf("\n") + // for i := 0; i <= lB; i++ { + // if i > 0 { + // fmt.Printf("%2c\t", bB[i-1]) + // } else { + // fmt.Printf("%2c\t", '-') + // } + + // for j := 0; j <= lA; j++ { + // fmt.Printf("%2d:%2d:%v\t", debug_scores[i][j], + // debug_path[i][j], debug_out[i][j]) + // } + // fmt.Printf("\n") + // } + // // end OF DEBUG CODE // + + if o { + return -1, -1 + } + + return s, l +} + +func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { + return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer) +} + +func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { + return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer) +} diff --git a/pkg/obiapat/apat.h b/pkg/obiapat/apat.h index 32bae70..c536523 100644 --- a/pkg/obiapat/apat.h +++ b/pkg/obiapat/apat.h @@ -151,10 +151,7 @@ int32_t ManberNoErr (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t beg int32_t ManberSub (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length); int32_t ManberIndel (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length); int32_t ManberAll (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length); -int32_t NwsPatAlign (Seq *pseq , Pattern *ppat, int32_t nerr , - int32_t *reslen , int32_t *reserr); - - /* apat_sys.c */ +int32_t NwsPatAlign (Seq *pseq , Pattern *ppat, int32_t nerr, int32_t begin, int32_t *reslen, int32_t *reserr); /* apat_sys.c */ float UserCpuTime (int32_t reset); float SysCpuTime (int32_t reset); diff --git a/pkg/obiapat/apat_search.c b/pkg/obiapat/apat_search.c index c4ef5a9..7781319 100644 --- a/pkg/obiapat/apat_search.c +++ b/pkg/obiapat/apat_search.c @@ -23,7 +23,8 @@ #define TOPCURS CursiToTop #define DOWNREAD ReadiDown -#define KRONECK(x, msk) ((~x & msk) ? 0 : 1) +//#define KRONECK(x, msk) ((~x & msk) ? 0 : 1) +#define KRONECK(x, msk) ((x & msk) ? 0 : 1) #define MIN(x, y) ((x) < (y) ? (x) : (y)) /* -------------------------------------------- */ @@ -192,8 +193,8 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) { int e, emax, found; uint32_t pos; - uint32_t smask, cmask, sindx; - uint32_t *pr, r[2 * MAX_PAT_ERR + 2]; + patword_t smask, cmask, sindx; + patword_t *pr, r[2 * MAX_PAT_ERR + 2]; uint8_t *data; StackiPtr *stkpos, *stkerr; uint32_t end; @@ -220,6 +221,9 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) stkpos = pseq->hitpos + patnum; stkerr = pseq->hiterr + patnum; + EmptyStacki(stkpos[0]); + EmptyStacki(stkerr[0]); + /* loop on text data */ for (pos = begin ; pos < end ; pos++) { @@ -256,12 +260,14 @@ int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) /* -------------------------------------------- */ int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) { - if (ppat->maxerr == 0) - return ManberNoErr(pseq, ppat, patnum, begin, length); - else if (ppat->hasIndel) + if (ppat->maxerr == 0){ + return ManberNoErr(pseq, ppat, patnum, begin, length);} + else if (ppat->hasIndel) { return ManberIndel(pseq, ppat, patnum, begin, length); - else + } + else { return ManberSub(pseq, ppat, patnum, begin, length); + } } @@ -271,11 +277,9 @@ int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length) /* (avec substitution obligatoire aux bords) */ /* -------------------------------------------- */ -int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr) - Seq *pseq; - Pattern *ppat; - int32_t nerr, *reslen, *reserr; -{ +int32_t NwsPatAlign(Seq *pseq, Pattern *ppat, + int32_t nerr, int32_t begin, + int32_t *reslen, int32_t *reserr) { uint8_t *sseq, *px; int32_t i, j, lseq, lpat, npos, dindel, dsub, *pc, *pi, *pd, *ps; @@ -283,7 +287,9 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr) static int32_t sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)]; - lseq = pseq->seqlen; + lpat = ppat->patlen; + lseq = MIN(lpat + MAX_PAT_ERR+1, pseq->seqlen - begin); + sseq = pseq->data + begin - 1; pc = sTab; /* |----|----| --> i */ pi = pc - 1; /* | ps | pd | | */ @@ -291,36 +297,39 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr) ps = pd - 1; /* | pi | pc | v j */ /* |---------| */ - lseq = pseq->seqlen; - lpat = ppat->patlen; - sseq = pseq->data - 1; - amask = ONEMASK >> lpat; + //amask = ONEMASK >> lpat; + amask = 0x1L << (ppat->patlen); for (j = 0 ; j <= lpat ; j++) { - + for (i = 0 , px = sseq ; i <= lseq ; i++, px++) { if (i && j) { dindel = MIN(*pi, *pd) + 1; + if (j == lpat) dindel--; dsub = *ps + KRONECK(ppat->smat[*px], amask); + // fprintf(stderr, "mismatch : %d %d %d %d\n",j,*px,KRONECK(ppat->smat[*px], amask),amask); *pc = MIN(dindel, dsub); } else if (i) /* j == 0 */ *pc = *pi + 1; else if (j) /* i == 0 */ - *pc = *pd + 1; + *pc = *pd; else /* root */ *pc = 0; + fprintf(stderr," %02d",*pc); pc++; pi++; pd++; ps++; } + fprintf(stderr,"\n"); - amask <<= 1; + // amask <<= 1; + amask >>= 1; } pc--; @@ -331,6 +340,7 @@ int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr) *reserr++ = *pc; npos++; } + fprintf(stderr,"i=%d *pc = %d<%d, reserr = %d npos = %d\n",i,*pc,nerr,*(reserr-1),npos); } return npos; diff --git a/pkg/obiapat/obiapat.c b/pkg/obiapat/obiapat.c index 028a711..85f0c55 100644 --- a/pkg/obiapat/obiapat.c +++ b/pkg/obiapat/obiapat.c @@ -337,7 +337,7 @@ int32_t delete_apatseq(SeqPtr pseq, return 1; } -PatternPtr buildPattern(const char *pat, int32_t error_max, +PatternPtr buildPattern(const char *pat, int32_t error_max, uint8_t hasIndel, int *errno, char **errmsg) { PatternPtr pattern; @@ -355,7 +355,7 @@ PatternPtr buildPattern(const char *pat, int32_t error_max, errno,errmsg); pattern->ok = true; - pattern->hasIndel= false; + pattern->hasIndel= hasIndel; pattern->maxerr = error_max; pattern->cpat = (char*)pattern + sizeof(Pattern); diff --git a/pkg/obiapat/obiapat.h b/pkg/obiapat/obiapat.h index 65c4260..a10acab 100644 --- a/pkg/obiapat/obiapat.h +++ b/pkg/obiapat/obiapat.h @@ -118,7 +118,7 @@ ecoseq_t *new_ecoseq_with_data( char *AC, int32_t delete_apatseq(SeqPtr pseq, int *errno, char **errmsg); -PatternPtr buildPattern(const char *pat, int32_t error_max, int *errno, char **errmsg); +PatternPtr buildPattern(const char *pat, int32_t error_max, uint8_t hasIndel, int *errno, char **errmsg); PatternPtr complementPattern(PatternPtr pat, int *errno, char **errmsg); SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen, diff --git a/pkg/obiapat/pattern.go b/pkg/obiapat/pattern.go index 703b73e..efbf08b 100644 --- a/pkg/obiapat/pattern.go +++ b/pkg/obiapat/pattern.go @@ -13,6 +13,8 @@ import ( log "github.com/sirupsen/logrus" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) @@ -65,14 +67,20 @@ var NilApatSequence = ApatSequence{nil} // the errormax parameter. Some positions can be marked as not // allowed for mismatches. They have to be signaled using a '#' // sign after the corresponding nucleotide. -func MakeApatPattern(pattern string, errormax int) (ApatPattern, error) { +func MakeApatPattern(pattern string, errormax int, allowsIndel bool) (ApatPattern, error) { cpattern := C.CString(pattern) defer C.free(unsafe.Pointer(cpattern)) cerrormax := C.int32_t(errormax) + + callosindel := C.uint8_t(0) + if allowsIndel { + callosindel = C.uint8_t(1) + } + var errno C.int32_t var errmsg *C.char - apc := C.buildPattern(cpattern, cerrormax, &errno, &errmsg) + apc := C.buildPattern(cpattern, cerrormax, callosindel, &errno, &errmsg) if apc == nil { message := C.GoString(errmsg) @@ -281,16 +289,13 @@ func (sequence ApatSequence) Free() { // values of the [3]int indicate respectively the start and the end position of // the match. Following the GO convention the end position is not included in the // match. The third value indicates the number of error detected for this occurrence. -func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (loc [][3]int) { - begin := 0 - length := sequence.Len() - - if len(limits) > 0 { - begin = limits[0] +func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int) (loc [][3]int) { + if begin < 0 { + begin = 0 } - if len(limits) > 1 { - length = limits[1] + if length < 0 { + length = sequence.Len() } nhits := int(C.ManberAll(sequence.pointer.pointer, @@ -310,13 +315,70 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (l for i := 0; i < nhits; i++ { start := int(stktmp[i]) err := int(errtmp[i]) - + log.Debugln(C.GoString(pattern.pointer.pointer.cpat), start, err) loc = append(loc, [3]int{start, start + patlen, err}) } + log.Debugln("------------") return loc } +func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (start int, end int, nerr int, matched bool) { + res := pattern.FindAllIndex(sequence, begin, length) + + sbuffer := [(int(C.MAX_PAT_LEN) + int(C.MAX_PAT_ERR) + 1) * (int(C.MAX_PAT_LEN) + 1)]uint64{} + buffer := sbuffer[:] + + if len(res) == 0 { + matched = false + return + } + + matched = true + + best := [3]int{0, 0, 10000} + for _, m := range res { + if m[2] < best[2] { + best = m + log.Debugln(best) + } + } + + nerr = best[2] + end = best[1] + + if nerr == 0 || !pattern.pointer.pointer.hasIndel { + start = best[0] + log.Debugln("No nws ", start, nerr) + return + } + + start = best[0] - nerr + end = best[0] + int(pattern.pointer.pointer.patlen) + nerr + start = goutils.MaxInt(start, 0) + end = goutils.MinInt(end, sequence.Len()) + + cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) + cseq := (*[1 << 30]byte)(unsafe.Pointer(sequence.pointer.pointer.cseq)) + + log.Debugln( + string((*cseq)[start:end]), + string((*cpattern)[0:int(pattern.pointer.pointer.patlen)]), + best[0], nerr, int(pattern.pointer.pointer.patlen), + sequence.Len(), start, end) + + score, lali := obialign.FastLCSEGFScoreByte( + (*cseq)[start:end], + (*cpattern)[0:int(pattern.pointer.pointer.patlen)], + nerr*2, true, &buffer) + + nerr = lali - score + start = best[0] + int(pattern.pointer.pointer.patlen) - lali + log.Println("results", score, lali, start, nerr) + return +} + +// tagaacaggctcctctag // func AllocatedApaSequences() int { // return int(_AllocatedApaSequences) // } diff --git a/pkg/obiapat/pcr.go b/pkg/obiapat/pcr.go index c87f879..864f6e3 100644 --- a/pkg/obiapat/pcr.go +++ b/pkg/obiapat/pcr.go @@ -133,7 +133,7 @@ func OptionForwardPrimer(primer string, max int) WithOption { f := WithOption(func(opt Options) { var err error - opt.pointer.forward, err = MakeApatPattern(primer, max) + opt.pointer.forward, err = MakeApatPattern(primer, max, false) if err != nil { log.Fatalf("error : %v\n", err) } @@ -155,7 +155,7 @@ func OptionReversePrimer(primer string, max int) WithOption { f := WithOption(func(opt Options) { var err error - opt.pointer.reverse, err = MakeApatPattern(primer, max) + opt.pointer.reverse, err = MakeApatPattern(primer, max, false) if err != nil { log.Fatalf("error : %v\n", err) } @@ -210,7 +210,7 @@ func _Pcr(seq ApatSequence, reverse := opt.pointer.reverse crev := opt.pointer.crev - forwardMatches := forward.FindAllIndex(seq) + forwardMatches := forward.FindAllIndex(seq,0,-1) if len(forwardMatches) > 0 { @@ -284,7 +284,7 @@ func _Pcr(seq ApatSequence, } } - forwardMatches = reverse.FindAllIndex(seq) + forwardMatches = reverse.FindAllIndex(seq,0,-1) if forwardMatches != nil { begin := forwardMatches[0][0] diff --git a/pkg/obingslibrary/match.go b/pkg/obingslibrary/match.go index 7afd418..3785d42 100644 --- a/pkg/obingslibrary/match.go +++ b/pkg/obingslibrary/match.go @@ -27,11 +27,11 @@ type DemultiplexMatch struct { Error error } -func (library *NGSLibrary) Compile(maxError int) error { +func (library *NGSLibrary) Compile(maxError int, allowsIndel bool) error { for primers, marker := range *library { err := marker.Compile(primers.Forward, primers.Reverse, - maxError) + maxError, allowsIndel) if err != nil { return err } @@ -57,15 +57,13 @@ func (library *NGSLibrary) ExtractBarcode(sequence *obiseq.BioSequence, inplace return match.ExtractBarcode(sequence, inplace) } -func (marker *Marker) Compile(forward, reverse string, maxError int) error { +func (marker *Marker) Compile(forward, reverse string, maxError int, allowsIndel bool) error { var err error - marker.forward, err = obiapat.MakeApatPattern(forward, - maxError) + marker.forward, err = obiapat.MakeApatPattern(forward, maxError, allowsIndel) if err != nil { return err } - marker.reverse, err = obiapat.MakeApatPattern(reverse, - maxError) + marker.reverse, err = obiapat.MakeApatPattern(reverse, maxError, allowsIndel) if err != nil { return err } @@ -105,33 +103,34 @@ func (marker *Marker) Compile(forward, reverse string, maxError int) error { return nil } + + func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { aseq, _ := obiapat.MakeApatSequence(sequence, false) - match := marker.forward.FindAllIndex(aseq, marker.taglength) - - if len(match) > 0 { + start, end, nerr ,matched := marker.forward.BestMatch(aseq, marker.taglength,-1) + if matched { sseq := sequence.String() - direct := sseq[match[0][0]:match[0][1]] - ftag := strings.ToLower(sseq[(match[0][0] - marker.taglength):match[0][0]]) + direct := sseq[start:end] + ftag := strings.ToLower(sseq[(start - marker.taglength):start]) m := DemultiplexMatch{ ForwardMatch: direct, ForwardTag: ftag, - BarcodeStart: match[0][1], - ForwardMismatches: match[0][2], + BarcodeStart: end, + ForwardMismatches: nerr, IsDirect: true, Error: nil, } - rmatch := marker.creverse.FindAllIndex(aseq, match[0][1]) + start, end, nerr ,matched = marker.creverse.BestMatch(aseq, start,-1) - if len(rmatch) > 0 { + if matched { // extracting primer matches - reverse, _ := sequence.Subsequence(rmatch[0][0], rmatch[0][1], false) + reverse, _ := sequence.Subsequence(start,end, false) defer reverse.Recycle() reverse = reverse.ReverseComplement(true) - rtag, err := sequence.Subsequence(rmatch[0][1], rmatch[0][1]+marker.taglength, false) + rtag, err := sequence.Subsequence(end, end+marker.taglength, false) defer rtag.Recycle() srtag := "" @@ -143,8 +142,8 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { } m.ReverseMatch = strings.ToLower(reverse.String()) - m.ReverseMismatches = rmatch[0][2] - m.BarcodeEnd = rmatch[0][0] + m.ReverseMismatches = nerr + m.BarcodeEnd = start m.ReverseTag = srtag sample, ok := marker.samples[TagPair{ftag, srtag}] @@ -162,32 +161,32 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { return &m } - match = marker.reverse.FindAllIndex(aseq, marker.taglength) + start, end, nerr ,matched = marker.reverse.BestMatch(aseq, marker.taglength,-1) - if len(match) > 0 { + if matched { sseq := sequence.String() - reverse := strings.ToLower(sseq[match[0][0]:match[0][1]]) - rtag := strings.ToLower(sseq[(match[0][0] - marker.taglength):match[0][0]]) + reverse := strings.ToLower(sseq[start:end]) + rtag := strings.ToLower(sseq[(start - marker.taglength):start]) m := DemultiplexMatch{ ReverseMatch: reverse, ReverseTag: rtag, - BarcodeStart: match[0][1], - ReverseMismatches: match[0][2], + BarcodeStart: end, + ReverseMismatches: nerr, IsDirect: false, Error: nil, } - rmatch := marker.cforward.FindAllIndex(aseq, match[0][1]) + start, end, nerr ,matched = marker.cforward.BestMatch(aseq, end,-1) - if len(rmatch) > 0 { + if matched { - direct, _ := sequence.Subsequence(rmatch[0][0], rmatch[0][1], false) + direct, _ := sequence.Subsequence(start,end, false) defer direct.Recycle() direct = direct.ReverseComplement(true) - ftag, err := sequence.Subsequence(rmatch[0][1], rmatch[0][1]+marker.taglength, false) + ftag, err := sequence.Subsequence(end,end+marker.taglength, false) defer ftag.Recycle() sftag := "" if err != nil { @@ -200,8 +199,8 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { m.ForwardMatch = direct.String() m.ForwardTag = sftag - m.ReverseMismatches = rmatch[0][2] - m.BarcodeEnd = rmatch[0][0] + m.ReverseMismatches = nerr + m.BarcodeEnd = start sample, ok := marker.samples[TagPair{sftag, rtag}] diff --git a/pkg/obingslibrary/worker.go b/pkg/obingslibrary/worker.go index df9ad26..0ab887d 100644 --- a/pkg/obingslibrary/worker.go +++ b/pkg/obingslibrary/worker.go @@ -8,6 +8,7 @@ type _Options struct { discardErrors bool unidentified string allowedMismatch int + allowsIndel bool withProgressBar bool parallelWorkers int batchSize int @@ -55,6 +56,14 @@ func OptionAllowedMismatches(count int) WithOption { return f } +func OptionAllowedIndel(allowed bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.allowsIndel = allowed + }) + + return f +} + // OptionParallelWorkers sets how many search // jobs will be run in parallel. func OptionParallelWorkers(nworkers int) WithOption { @@ -87,6 +96,10 @@ func (options Options) AllowedMismatch() int { return options.pointer.allowedMismatch } +func (options Options) AllowsIndel() bool { + return options.pointer.allowsIndel +} + func (options Options) WithProgressBar() bool { return options.pointer.withProgressBar } @@ -110,6 +123,7 @@ func MakeOptions(setters []WithOption) Options { discardErrors: true, unidentified: "", allowedMismatch: 0, + allowsIndel: false, withProgressBar: false, parallelWorkers: 4, batchSize: 1000, @@ -145,7 +159,7 @@ func ExtractBarcodeSlice(ngslibrary NGSLibrary, opt := MakeOptions(options) - ngslibrary.Compile(opt.AllowedMismatch()) + ngslibrary.Compile(opt.AllowedMismatch(),opt.AllowsIndel()) return _ExtractBarcodeSlice(ngslibrary, sequences, opt) } @@ -155,7 +169,7 @@ func ExtractBarcodeSliceWorker(ngslibrary NGSLibrary, opt := MakeOptions(options) - ngslibrary.Compile(opt.AllowedMismatch()) + ngslibrary.Compile(opt.AllowedMismatch(),opt.AllowsIndel()) worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice { return _ExtractBarcodeSlice(ngslibrary, sequences, opt) diff --git a/pkg/obiseq/compare.go b/pkg/obiseq/compare.go new file mode 100644 index 0000000..0ad1c66 --- /dev/null +++ b/pkg/obiseq/compare.go @@ -0,0 +1,46 @@ +package obiseq + +import ( + "bytes" +) + +type Compare func(a, b *BioSequence) int + +func CompareSequence(a, b *BioSequence) int { + return bytes.Compare(a.sequence, b.sequence) +} + +func CompareQuality(a, b *BioSequence) int { + return bytes.Compare(a.qualities, b.qualities) +} + +// func CompareAttributeBuillder(key string) Compare { +// f := func(a, b *BioSequence) int { +// ak, oka := a.GetAttribute(key) +// bk, okb := b.GetAttribute(key) + +// switch { +// case !oka && !okb: +// return 0 +// case !oka: +// return -1 +// case !okb: +// return +1 +// } + +// //av,oka := ak.(constraints.Ordered) +// //bv,okb := bk.(constraints.Ordered) + +// switch { +// case !oka && !okb: +// return 0 +// case !oka: +// return -1 +// case !okb: +// return +1 +// } + +// } + +// return f +// } diff --git a/pkg/obitools/obimultiplex/demultiplex.go b/pkg/obitools/obimultiplex/demultiplex.go index 69b8a9c..14d6beb 100644 --- a/pkg/obitools/obimultiplex/demultiplex.go +++ b/pkg/obitools/obimultiplex/demultiplex.go @@ -16,6 +16,7 @@ func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error opts = append(opts, obingslibrary.OptionAllowedMismatches(CLIAllowedMismatch()), + obingslibrary.OptionAllowedIndel(CLIAllowsIndel()), obingslibrary.OptionUnidentified(CLIUnidentifiedFileName()), obingslibrary.OptionDiscardErrors(!CLIConservedErrors()), obingslibrary.OptionParallelWorkers(obioptions.CLIParallelWorkers()), diff --git a/pkg/obitools/obimultiplex/options.go b/pkg/obitools/obimultiplex/options.go index 37ffd80..f0e2676 100644 --- a/pkg/obitools/obimultiplex/options.go +++ b/pkg/obitools/obimultiplex/options.go @@ -13,6 +13,7 @@ import ( var _NGSFilterFile = "" var _UnidentifiedFile = "" var _AllowedMismatch = int(2) +var _AllowsIndel = false var _ConservedError = false // PCROptionSet defines every options related to a simulated PCR. @@ -33,6 +34,9 @@ func MultiplexOptionSet(options *getoptions.GetOpt) { options.BoolVar(&_ConservedError, "keep-errors", _ConservedError, options.Description("Prints symbol counts.")) + options.BoolVar(&_AllowsIndel, "with-indels", _AllowsIndel, + options.Description("Allows for indels during the primers matching.")) + options.StringVar(&_UnidentifiedFile, "unidentified", _UnidentifiedFile, options.Alias("u"), options.Description("Filename used to store the sequences unassigned to any sample.")) @@ -52,6 +56,9 @@ func CLIAllowedMismatch() int { return _AllowedMismatch } +func CLIAllowsIndel() bool { + return _AllowsIndel +} func CLIUnidentifiedFileName() string { return _UnidentifiedFile } diff --git a/pkg/obitools/obipcr/options.go b/pkg/obitools/obipcr/options.go index f6d60ea..0d5fc0a 100644 --- a/pkg/obitools/obipcr/options.go +++ b/pkg/obitools/obipcr/options.go @@ -59,7 +59,7 @@ func OptionSet(options *getoptions.GetOpt) { // ForwardPrimer returns the sequence of the forward primer as indicated by the // --forward command line option func ForwardPrimer() string { - pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch) + pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch, false) if err != nil { log.Fatalf("%+v", err) @@ -73,7 +73,7 @@ func ForwardPrimer() string { // ReversePrimer returns the sequence of the reverse primer as indicated by the // --reverse command line option func ReversePrimer() string { - pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch) + pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch,false) if err != nil { log.Fatalf("%+v", err)