diff --git a/cmd/test/main.go b/cmd/test/main.go index 59500cb..a39050f 100644 --- a/cmd/test/main.go +++ b/cmd/test/main.go @@ -1,15 +1,65 @@ package main import ( + "fmt" "log" "os" "runtime/trace" - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiclean" - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "cloudeng.io/algo/lcs" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) +func SESStat(script *lcs.EditScript[byte]) (int, int) { + llcs := 0 + gaps := 0 + extra := 0 + i := 0 + ls := len(*script) + for i < ls { + e := (*script)[i] + // log.Println(i,e,e.Op) + switch e.Op { + case lcs.Identical: // 2 + if gaps > 0 { + extra += gaps + } + llcs++ + gaps = 0 + i++ + case lcs.Delete: // 1 + if i+1 < ls { + en := (*script)[i+1] + if en.Op == lcs.Identical && e.Val == en.Val { + log.Println("Swap delete") + (*script)[i], (*script)[i+1] = (*script)[i+1], (*script)[i] + continue + } + } + gaps-- + i++ + case lcs.Insert: // 0 + if i+1 < ls { + en := (*script)[i+1] + if en.Op == lcs.Identical && e.Val == en.Val { + log.Println("Swap Insert") + (*script)[i], (*script)[i+1] = (*script)[i+1], (*script)[i] + continue + } + } + gaps++ + i++ + } + } + + if gaps > 0 { + extra += gaps + } + + return llcs, extra +} + func main() { // Creating a file called cpu.trace. @@ -20,15 +70,46 @@ func main() { trace.Start(ftrace) defer trace.Stop() - option_parser := obioptions.GenerateOptionParser( - obiconvert.InputOptionSet, - ) + // "---CACGATCGTGC-CAGTCAT-GGCTAT" + // "CCCCA-GATCGTGCG-AGTCATGGGCTAT" - _, args, _ := option_parser(os.Args) + // 00 0 00000000 1111111 111222 + // 01 2 34567889 0123456 789012 + // "CA-C-GATCGTGC-CAGTCAT-GGCTAT" + // "CCCCAGATCGTGCG-AGTCATGGGCTAT" - fs, _ := obiconvert.ReadBioSequencesBatch(args...) + //A := "CACGATCGTGCCCCCAGTCATGGCTAT" + A := "AAATGCCCCAGATCGTGC" + B := "TGCCCCAGAT" - obiclean.IOBIClean(fs) + //A = "aaaggaacttggactgaagatttccacagaggttgcgaatgaaaaacacgtattcgaatgcctcaaatacggaatcgatcttgtctg" + A = "aaaggaacttggactgaagatttccacagaggttgcgaatgaaaaacacgtattcgaatgcctcaaatacggaatcgatcttgtctg" + B = "atccggttttacgaaaatgcgtgtggtaggggcttatgaaaacgataatcgaataaaaaagggtaggtgcagagactcaacggaagatgttctaacaaatgg" + // A = "aataa" + // B = "ttttt" + sA := obiseq.NewBioSequence("A", []byte(A), "") + sB := obiseq.NewBioSequence("A", []byte(B), "") + // M := lcs.NewMyers([]byte(A), []byte(B)) + // fmt.Println(M.LCS()) + // fmt.Println(M.SES()) + // fmt.Println(len(M.LCS())) + // M.SES().FormatHorizontal(os.Stdout, []byte(B)) + // llcs, extra := SESStat(M.SES()) + // nlcs, nali := obialign.LCSScore(sA, sB, sB.Length(), nil) + // fmt.Println(llcs, extra, len(A)+extra) + // fmt.Println(nlcs, nali) + nlcs, nali := obialign.FastLCSScore(sA, sB, sB.Length(), nil) + fmt.Println(nlcs, nali) + + // option_parser := obioptions.GenerateOptionParser( + // obiconvert.InputOptionSet, + // ) + + // _, args, _ := option_parser(os.Args) + + // fs, _ := obiconvert.ReadBioSequencesBatch(args...) + + // obiclean.IOBIClean(fs) // buffer := make([]byte, 0) // fs.Next() diff --git a/pkg/obialign/fastlcs.go b/pkg/obialign/fastlcs.go index 86770ca..98c6c0f 100644 --- a/pkg/obialign/fastlcs.go +++ b/pkg/obialign/fastlcs.go @@ -1,29 +1,42 @@ package obialign import ( - "log" - + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) +const wsize = 16 +const dwsize = wsize * 2 + // Out values are always the smallest // Among in values, they rank according to their score // For equal score the shortest path is the best -func _encodeValues(score, length int, out bool) uint64 { - const mask = uint64(1<<16) - 1 +func encodeValues(score, length int, out bool) uint64 { + const mask = uint64(1<> 16) & mask) +func _isout(value uint64) bool { + const outmask = uint64(1) << dwsize + return (value & outmask) == 0 +} + +func _lpath(value uint64) int { + const mask = uint64(1<> wsize) & mask) length := int(((value + 1) ^ mask) & mask) - out := (value & (1 << 32)) == 0 + out := (value & outmask) == 0 return score, length, out } @@ -32,34 +45,20 @@ func _incpath(value uint64) uint64 { } func _incscore(value uint64) uint64 { - const incr = uint64(1) << 16 + const incr = uint64(1) << wsize return value + incr } func _setout(value uint64) uint64 { - const outmask = uint64(1) << 32 - return value | outmask + const outmask = ^(uint64(1) << dwsize) + return value & outmask } -func _isout(value uint64) bool { - const outmask = uint64(1) << 32 - return (value & outmask) == 0 -} +var _empty = encodeValues(0, 0, false) +var _out = encodeValues(0, 30000, true) +var _notavail = encodeValues(0, 30000, false) -var _empty = _encodeValues(0, 0, false) -var _out = _encodeValues(0, 3000, true) - -func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int) (int, int) { - - // x:=_encodeValues(12,0,false) - // xs,xl,xo := _decodeValues(x) - // log.Println(x,xs,xl,xo) - // x=_encodeValues(12,1,false) - // xs,xl,xo = _decodeValues(x) - // log.Println(x,xs,xl,xo) - // x=_encodeValues(12,2,false) - // xs,xl,xo = _decodeValues(x) - // log.Println(x,xs,xl,xo) +func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { lA := seqA.Length() lB := seqB.Length() @@ -74,103 +73,88 @@ func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int) (int, int) { // The difference of length is larger the maximum allowed errors if delta > maxError { - // log.Println("Too large difference of length") return -1, -1 } // Doit-on vraiment diviser par deux ??? pas certain - extra := ((maxError - delta) / 2) + 1 + extra := (maxError - delta) + 1 even := 1 + delta + 2*extra width := 2*even - 1 - previous := make([]uint64, width) - current := make([]uint64, width) - - // Initialise the first line - - for j := 0; j < width; j++ { - if (j == extra+even) || (j == extra+even-1) { - previous[j] = _encodeValues(0, 1, false) - } else { - previous[j] = _empty - } + if buffer == nil { + var local []uint64 + buffer = &local } + if cap(*buffer) < 2*width { + *buffer = make([]uint64, 3*width) + } + + previous := (*buffer)[0:width] + current := (*buffer)[width:(2 * width)] + + previous[extra] = _empty + previous[extra+even] = encodeValues(0, 1, false) + previous[extra+even-1] = encodeValues(0, 1, false) + N := lB + ((delta) >> 1) - X := width + + bA := seqA.Sequence() + bB := seqB.Sequence() // log.Println("N = ", N) for y := 1; y <= N; y++ { // in_matrix := false - for x := 0; x < width; x++ { - X = y * width + x - modulo := X % width - begin := X - modulo - quotien := begin / width + x1 := y - lB + extra + x2 := extra - y + xs := goutils.MaxInt(goutils.MaxInt(x1, x2), 0) - i := quotien + extra - modulo - j := quotien - extra + modulo + x1 = y + extra + x2 = lA + extra - y + xf := goutils.MinInt(goutils.MinInt(x1, x2), even-1) + 1 - if x >= even { - i += even - j += 1 - even - } + for x := xs; x < xf; x++ { - //log.Println(X, i, j, i < 0 || j < 0 || i > lB || j > lA) - - // We are out of tha alignement matrix - if i < 0 || j < 0 || i > lB || j > lA { - X++ - continue - } + i := y - x + extra + j := y + x - extra var Sdiag, Sleft, Sup uint64 switch { case i == 0: - Sup = _encodeValues(0, 30000, false) - Sdiag = _encodeValues(0, 30000, false) - Sleft = _encodeValues(0, j-1, false) + Sup = _notavail + Sdiag = _notavail + Sleft = encodeValues(0, j-1, false) case j == 0: - Sup = _encodeValues(0, i-1, false) - Sdiag = _encodeValues(0, 30000, false) - Sleft = _encodeValues(0, 30000, false) + Sup = encodeValues(0, i-1, false) + Sdiag = _notavail + Sleft = _notavail default: Sdiag = previous[x] - if x >= even { - Sleft = current[x-even] - Sup = current[x-even+1] + + if bA[j-1] == bB[i-1] { + Sdiag = _incscore(Sdiag) + } + + if x < (even - 1) { + Sup = previous[x+even] } else { - if x < (even - 1) { - Sup = previous[x+even] - } else { - Sup = _out - } - if x > 0 { - Sleft = previous[x+even-1] - } else { - Sleft = _out - } + Sup = _out + } + if x > 0 { + Sleft = previous[x+even-1] + } else { + Sleft = _out } } - // log.Println("scores @",i,j,": ",Sdiag,Sup,Sleft) - // ds,dl,ol := _decodeValues(Sdiag) - // log.Println(ds,dl,ol) - // ds,dl,ol = _decodeValues(Sup) - // log.Println(ds,dl,ol) - // ds,dl,ol = _decodeValues(Sleft) - // log.Println(ds,dl,ol) var score uint64 switch { case Sdiag >= Sup && Sdiag >= Sleft: score = Sdiag - if seqA.Sequence()[j-1] == seqB.Sequence()[i-1] { - score = _incscore(score) - } case Sup >= Sleft: score = Sup default: @@ -181,36 +165,72 @@ func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int) (int, int) { score = _setout(score) } - // if i < 5 && j < 5 { - // ds, dl, ol := _decodeValues(_incpath(score)) - // log.Println("@", i, j,":", ds, dl, ol) - // } + current[x] = _incpath(score) + } + // . 9 10 + 2 - 1 + x1 = y - lB + extra + even + x2 = extra - y + even - 1 + xs = goutils.MaxInt(goutils.MaxInt(x1, x2), even) + + x1 = y + extra + even + x2 = lA + extra - y + even - 1 + xf = goutils.MinInt(goutils.MinInt(x1, x2), width-1) + 1 + + for x := xs; x < xf; x++ { + + i := y - x + extra + even + j := y + x - extra - even + 1 + + var Sdiag, Sleft, Sup uint64 + + switch { + + case i == 0: + Sup = _notavail + Sdiag = _notavail + Sleft = encodeValues(0, j-1, false) + case j == 0: + Sup = encodeValues(0, i-1, false) + Sdiag = _notavail + Sleft = _notavail + default: + Sdiag = previous[x] + + if bA[j-1] == bB[i-1] { + Sdiag = _incscore(Sdiag) + } + + Sleft = current[x-even] + Sup = current[x-even+1] + + } + + var score uint64 + switch { + case Sdiag >= Sup && Sdiag >= Sleft: + score = Sdiag + case Sup >= Sleft: + score = Sup + default: + score = Sleft + } + + if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { + score = _setout(score) + } current[x] = _incpath(score) - - // if i == lB && j == lA { - // s, l, o := _decodeValues(current[x]) - // log.Println("Results in ", x, y, "(", i, j, ") values : ", s, l, o) - // } - - X++ } - // if !in_matrix { - // log.Fatalln("Never entred in the matrix", y, "/", N) - // } + previous, current = current, previous } - s, l, o := _decodeValues(previous[(delta%2)*even+extra+(delta>>1)]) + s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)]) if o { - log.Println("Too much error", s, l, (lA%2)*even+(lA-lB), lA, lB, width, even, N) return -1, -1 } return s, l } - -// width * j + modulo + width * extra - width * modulo= X -// i = (X - modulo)/width - modulo + extra diff --git a/pkg/obialign/lcs.go b/pkg/obialign/lcs.go deleted file mode 100644 index 72a9217..0000000 --- a/pkg/obialign/lcs.go +++ /dev/null @@ -1,240 +0,0 @@ -package obialign - -import ( - "log" - - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" -) - -func max(a, b int) int { - if a > b { - return a - } - return b -} - -func min(a, b int) int { - if a < b { - return a - } - return b -} - -type LCSMatrix struct { - matrix []int16 // Score matrix - //lenght []int16 // Alignment length matrix - ll int // Length of the longest sequence - l int // Length of the shortest sequence - delta int // ll - l - extra int - w int -} - -func NewLCSMatrix(matrix *LCSMatrix, L, l int, maxError int) *LCSMatrix { - if matrix == nil { - matrix = &LCSMatrix{} - } - - if l > L { - log.Panicf("L (%d) must be greater or equal to l (%d)", L, l) - } - - delta := L - l - extra := ((maxError - delta) / 2) + 1 - - needed := (L * (1 + delta + 2*extra)) * 2 - - if needed > matrix.Cap() { - matrix.matrix = make([]int16, needed) - //matrix.lenght = make([]int16, needed) - } - - matrix.matrix = matrix.matrix[0:needed] - // matrix.lenght = matrix.lenght[:needed] - - matrix.ll = L - matrix.l = l - matrix.delta = delta - matrix.extra = extra - matrix.w = delta + 1 + 2*extra - - return matrix -} - -func (matrix *LCSMatrix) Cap() int { - return cap(matrix.matrix) -} - -func (matrix *LCSMatrix) Length() int { - return len(matrix.matrix) -} - -func (matrix *LCSMatrix) Get(i, j int) (int, int) { - ij := max(0, i-matrix.extra) - sj := min(i+matrix.delta+matrix.extra, matrix.ll) - - switch { - case i == 0: - return 0,j - case j == 0: - return 0,i - case j < ij || j > sj: - return -1, 30000 - default: - offset := (matrix.extra + matrix.w*(i-1) + (matrix.w-1)*(j-i)) * 2 - return int(matrix.matrix[offset]), int(matrix.matrix[offset+1]) - } -} - -func (matrix *LCSMatrix) GetNeibourgh(i, j int) ( - sd int, ld int, - sup int, lup int, - sleft int, lleft int) { - offset := matrix.extra + matrix.w*(j-2) + i - j - - it := i - 1 - jt := j - 1 - - ij0 := max(0, i-matrix.extra) - sj0 := min(i+matrix.delta+matrix.extra, matrix.ll) - ij1 := max(0, it-matrix.extra) - sj1 := min(it+matrix.delta+matrix.extra, matrix.ll) - - switch { - case i == 1: - sd = 0 - ld = jt - case j == 1: - sd = 0 - ld = it - case jt < ij1 || jt > sj1: - sd = -1 - ld = 30000 - default: - sd = int(matrix.matrix[offset*2]) - ld = int(matrix.matrix[offset*2+1]) - } - - offset++ - - - switch { - case i == 0: - sleft = 0 - lleft = jt - case j == 1: - sleft = 0 - lleft = i - case jt < ij0 || jt > sj0: - sleft = -1 - lleft = 30000 - default: - sleft = int(matrix.matrix[offset*2]) - lleft = int(matrix.matrix[offset*2+1]) - } - - offset += matrix.w - 2 - - switch { - case i == 1: - sup = 0 - lup = j - case j == 0: - sup = 0 - lup = it - case j < ij1 || j > sj1: - sup = -1 - lup = 30000 - default: - sup = int(matrix.matrix[offset*2]) - lup = int(matrix.matrix[offset*2+1]) - } - - return -} - -func (matrix *LCSMatrix) Set(i, j int, score, length int) { - ij := max(0, i-matrix.extra) - sj := min(i+matrix.delta+matrix.extra, matrix.ll) - - if i > 0 && j > 0 && j >= ij && j <= sj { - offset := (matrix.extra + matrix.w*(i-1) + (matrix.w-1)*(j-i)) * 2 - matrix.matrix[offset] = int16(score) - matrix.matrix[offset+1] = int16(length) - } -} - -// Computes the LCS between two DNA sequences and the length of the -// corresponding alignment -func LCSScore(seqA, seqB *obiseq.BioSequence, maxError int, - matrix *LCSMatrix) (int, int) { - - if seqA.Length() == 0 { - log.Fatal("Sequence A has a length of 0") - } - - if seqB.Length() == 0 { - log.Fatal("Sequence B has a length of 0") - } - // swapped := false - - if seqA.Length() < seqB.Length() { - seqA, seqB = seqB, seqA - // swapped = true - } - - if (seqA.Length() - seqB.Length()) > maxError { - return -1, -1 - } - - la := seqA.Length() - lb := seqB.Length() - sa := seqA.Sequence() - sb := seqB.Sequence() - - matrix = NewLCSMatrix(matrix, la, lb, maxError) - - for i := 1; i <= matrix.l; i++ { - ij := max(1, i-matrix.extra) - sj := min(i+matrix.delta+matrix.extra, matrix.ll) - for j := ij; j <= sj; j++ { - sd, ld, sup, lup, sleft, lleft := matrix.GetNeibourgh(i, j) - if i > lb { - log.Println("Error on seq B ", 1, matrix.l) - log.Println(i) - log.Println(seqB.Length(), "/", lb) - log.Println(string(sa)) - log.Fatalln(string(sb)) - } - if j > la { - log.Println("Error on seq A ", ij, sj) - log.Println(j) - log.Println(seqA.Length(), "/", la) - log.Println(string(sa)) - log.Fatalln(string(sb)) - } - if sb[i-1] == sa[j-1] { - sd++ - } - // sup, lup := matrix.Get(i-1, j) - // sleft, lleft := matrix.Get(i, j-1) - switch { - case sd >= sup && sd >= sleft: - matrix.Set(i, j, sd, ld+1) - case sup >= sleft && sup >= sd: - matrix.Set(i, j, sup, lup+1) - default: - matrix.Set(i, j, sleft, lleft+1) - } - } - } - - s, l := matrix.Get(lb, la) - - if (l - s) > maxError { - // log.Println(l,s,l-s,maxError) - return -1, -1 - } - - return int(s), int(l) -} diff --git a/pkg/obitools/obiclean/graph.go b/pkg/obitools/obiclean/graph.go index 533b4bc..4a7576a 100644 --- a/pkg/obitools/obiclean/graph.go +++ b/pkg/obitools/obiclean/graph.go @@ -360,14 +360,14 @@ func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int { nseq := len(*seqs) running := sync.WaitGroup{} - linePairs := func(matrix *obialign.LCSMatrix, i int) { + linePairs := func(matrix *[]uint64, i int) { son := (*seqs)[i] for j := i + 1; j < nseq; j++ { father := (*seqs)[j] d, _, _, _ := obialign.D1Or0(son.Sequence, father.Sequence) if d < 0 { - lcs, lali := obialign.LCSScore(son.Sequence, father.Sequence, + lcs, lali := obialign.FastLCSScore(son.Sequence, father.Sequence, step, matrix) d := (lali - lcs) @@ -385,9 +385,10 @@ func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int { // idxChan := make(chan [][]Ratio) ff := func() { - matrix := obialign.NewLCSMatrix(nil, 150, 150, step) + var matrix []uint64 + for i := range lineChan { - linePairs(matrix, i) + linePairs(&matrix, i) } running.Done() diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 1a65ce3..adcca21 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -22,34 +22,26 @@ func FindClosests(sequence *obiseq.BioSequence, refcounts []*obikmer.Table4mer, runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) { - matrix := obialign.NewLCSMatrix(nil, - sequence.Length(), - sequence.Length(), - sequence.Length()) + var matrix []uint64 seqwords := obikmer.Count4Mer(sequence, nil, nil) cw := make([]int, len(refcounts)) for i, ref := range refcounts { cw[i] = obikmer.Common4Mer(seqwords, ref) - // if i < 50 { - // print(cw[i]) - // print(";") - // } } - // print("\n") o := goutils.ReverseIntOrder(cw) - mcw := 100000 - for _, i := range o { - if cw[i] < mcw { - mcw = cw[i] - } - if cw[i] > mcw { - log.Panicln("wrong order") - } - } + // mcw := 100000 + // for _, i := range o { + // if cw[i] < mcw { + // mcw = cw[i] + // } + // if cw[i] > mcw { + // log.Panicln("wrong order") + // } + // } bests := obiseq.MakeBioSequenceSlice() bests = append(bests, references[o[0]]) @@ -74,7 +66,8 @@ func FindClosests(sequence *obiseq.BioSequence, // log.Println(sequence.Id(),cw[j], maxe) if runExact || (atMost <= (maxe + 1)) { - lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix) + lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1,&matrix) + // lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix) n++ if lcs == -1 { nf++