From cfddc7816199ef1f6bc9835a2732128fe6f4aa9a Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 25 Jan 2023 13:22:56 +0100 Subject: [PATCH] Patch an aligment bug on obipairing --- pkg/obialign/dnamatrix.go | 16 ++++---- pkg/obialign/pairedendalign.go | 37 ++++++++++++----- pkg/obiseq/biosequenceslice.go | 14 +++++-- pkg/obiseq/worker.go | 72 +++++++++++++++++++++++++++++++--- 4 files changed, 112 insertions(+), 27 deletions(-) diff --git a/pkg/obialign/dnamatrix.go b/pkg/obialign/dnamatrix.go index 74c7068..845692d 100644 --- a/pkg/obialign/dnamatrix.go +++ b/pkg/obialign/dnamatrix.go @@ -55,21 +55,21 @@ func _MatchScoreRatio(a, b byte) (float64, float64) { l2 := math.Log(2) l3 := math.Log(3) - l4 := math.Log(4) l10 := math.Log(10) - lE1 := -float64(a)/10*l10 - l4 - lE2 := -float64(b)/10*l10 - l4 - lO1 := math.Log1p(-math.Exp(lE1 + l3)) - lO2 := math.Log1p(-math.Exp(lE2 + l3)) + lalea := math.Log(30) // 1 /(change of the random model) + lE1 := -float64(a)/10*l10 - l3 // log proba of sequencing error on A/3 + lE2 := -float64(b)/10*l10 - l3 // log proba of sequencing error on B/3 + lO1 := math.Log1p(-math.Exp(lE1 + l3)) // log proba no being an error on A + lO2 := math.Log1p(-math.Exp(lE2 + l3)) // log proba no being an error on B lO1O2 := lO1 + lO2 lE1E2 := lE1 + lE2 lO1E2 := lO1 + lE2 lO2E1 := lO2 + lE1 - MM := _Logaddexp(lO1O2, lE1E2+l3) + l4 - Mm := _Logaddexp(_Logaddexp(lO1E2, lO2E1), lE1E2+l2) + l4 + MM := _Logaddexp(lO1O2, lE1E2+l3) // Proba match when match observed + Mm := _Logaddexp(_Logaddexp(lO1E2, lO2E1), lE1E2+l2) // Proba match when mismatch observed - return MM, Mm + return MM + lalea, Mm + lalea } func _InitNucPartMatch() { diff --git a/pkg/obialign/pairedendalign.go b/pkg/obialign/pairedendalign.go index 821c30b..68487a7 100644 --- a/pkg/obialign/pairedendalign.go +++ b/pkg/obialign/pairedendalign.go @@ -52,6 +52,7 @@ func _GetMatrix(matrix *[]int, lenA, a, b int) int { } func _GetMatrixFrom(matrix *[]int, lenA, a, b int) (int, int, int) { + // Formula rechecked on 01/25/2023 i := (b+1)*(lenA+1) + a j := i - lenA m := *matrix @@ -68,10 +69,15 @@ func _PairingScorePeAlign(baseA, qualA, baseB, qualB byte) int { return _NucScorePartMatchMismatch[qualA][qualB] default: return int(partMatch*float64(_NucScorePartMatchMatch[qualA][qualB]) + - (1-partMatch)*float64(_NucScorePartMatchMismatch[qualA][qualB]) + 0.5) + (1-partMatch)*float64(_NucScorePartMatchMismatch[qualA][qualB]) + + 0.5) } } +// Gaps at the beginning of B and at the end of A are free +// With A spanning over lines and B over columns +// - First column gap = 0 +// - Last line gaps = 0 func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64, scoreMatrix, pathMatrix *[]int) int { @@ -95,6 +101,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64, *scoreMatrix = (*scoreMatrix)[:needed] *pathMatrix = (*pathMatrix)[:needed] + // Sets the first position of the matrix with 0 score _SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0) // Fills the first column with score 0 @@ -102,17 +109,20 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64, _SetMatrices(scoreMatrix, pathMatrix, la, i, -1, 0, -1) } - la1 := la - 1 + la1 := la - 1 // Except the last line (gaps are free on it) for j := 0; j < lb; j++ { + // Fill the first line with scores corresponding to a set of gaps _SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1) for i := 0; i < la1; i++ { left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j) + diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j]) left += gapPenalty top += gapPenalty + switch { case diag > left && diag > top: _SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0) @@ -143,6 +153,10 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64, return _GetMatrix(scoreMatrix, la, la1, lb-1) } +// Gaps at the beginning of A and at the end of B are free +// With A spanning over lines and B over columns +// - First line gap = 0 +// - Last column gaps = 0 func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64, scoreMatrix, pathMatrix *[]int) int { @@ -166,17 +180,19 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64, *scoreMatrix = (*scoreMatrix)[:needed] *pathMatrix = (*pathMatrix)[:needed] + // Sets the first position of the matrix with 0 score _SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0) - // Fills the first column with score 0 + // Fills the first column with scores corresponding to a set of gaps for i := 0; i < la; i++ { _SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1) } - lb1 := lb - 1 + lb1 := lb - 1 // Except the last column (gaps are free on it) for j := 0; j < lb1; j++ { + // Fill the first line with zero score _SetMatrices(scoreMatrix, pathMatrix, la, -1, j, 0, 1) for i := 0; i < la; i++ { @@ -187,7 +203,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64, top += gapPenalty switch { - case diag > left && left > top: + case diag > left && diag > top: _SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0) case left > diag && left > top: _SetMatrices(scoreMatrix, pathMatrix, la, i, j, left, +1) @@ -218,6 +234,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64, } return _GetMatrix(scoreMatrix, la, la-1, lb1) + } func PELeftAlign(seqA, seqB *obiseq.BioSequence, gap float64, @@ -295,11 +312,9 @@ func PEAlign(seqA, seqB *obiseq.BioSequence, over = seqB.Len() + shift } - + // At least one mismatch exists in the overlaping region if fastScore+3 < over { - // At least one mismatch exists in the overlaping region - if shift > 0 { startA = shift - delta if startA < 0 { @@ -319,8 +334,6 @@ func PEAlign(seqA, seqB *obiseq.BioSequence, &arena.pointer.pathMatrix) } else { - // Both overlaping regions are identicals - startA = 0 startB = -shift - delta if startB < 0 { @@ -333,6 +346,7 @@ func PEAlign(seqA, seqB *obiseq.BioSequence, rawSeqA = seqA.Sequence()[:partLen] qualSeqA = seqA.Qualities()[:partLen] extra3 = partLen - seqA.Len() + score = _FillMatrixPeRightAlign( rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, &arena.pointer.scoreMatrix, @@ -344,6 +358,9 @@ func PEAlign(seqA, seqB *obiseq.BioSequence, &arena.pointer.path) } else { + + // Both overlaping regions are identicals + if shift > 0 { startA = shift startB = 0 diff --git a/pkg/obiseq/biosequenceslice.go b/pkg/obiseq/biosequenceslice.go index 78558a4..5969ef1 100644 --- a/pkg/obiseq/biosequenceslice.go +++ b/pkg/obiseq/biosequenceslice.go @@ -19,13 +19,19 @@ var _BioSequenceSlicePool = sync.Pool{ } // > This function returns a pointer to a new `BioSequenceSlice` object -func NewBioSequenceSlice() *BioSequenceSlice { - return _BioSequenceSlicePool.Get().(*BioSequenceSlice) +func NewBioSequenceSlice(size ...int) *BioSequenceSlice { + slice := _BioSequenceSlicePool.Get().(*BioSequenceSlice) + if len(size) > 0 { + s := size[0] + slice.InsureCapacity(s) + (*slice)=(*slice)[0:s] + } + return slice } // `MakeBioSequenceSlice()` returns a pointer to a new `BioSequenceSlice` struct -func MakeBioSequenceSlice() BioSequenceSlice { - return *NewBioSequenceSlice() +func MakeBioSequenceSlice(size ...int) BioSequenceSlice { + return *NewBioSequenceSlice(size...) } func (s *BioSequenceSlice) Recycle() { diff --git a/pkg/obiseq/worker.go b/pkg/obiseq/worker.go index adecc0f..6a1f5e7 100644 --- a/pkg/obiseq/worker.go +++ b/pkg/obiseq/worker.go @@ -5,6 +5,10 @@ type SeqAnnotator func(*BioSequence) type SeqWorker func(*BioSequence) *BioSequence type SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice +func NilSeqWorker(seq *BioSequence) *BioSequence { + return seq +} + func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker { f := func(seq *BioSequence) *BioSequence { function(seq) @@ -14,14 +18,57 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker { } func SeqToSliceWorker(worker SeqWorker, inplace bool) SeqSliceWorker { - f := func(input BioSequenceSlice) BioSequenceSlice { - output := input - if (! inplace) { - output = MakeBioSequenceSlice() + var f SeqSliceWorker + + if worker == nil { + if inplace { + f = func(input BioSequenceSlice) BioSequenceSlice { + return input + } + } else { + f = func(input BioSequenceSlice) BioSequenceSlice { + output := MakeBioSequenceSlice(len(input)) + copy(output,input) + return output + } } - for i,s := range(input) { + } else { + f = func(input BioSequenceSlice) BioSequenceSlice { + output := input + if !inplace { + output = MakeBioSequenceSlice(len(input)) + } + for i, s := range input { output[i] = worker(s) } + + return output + } + } + + return f +} + +func SeqToSliceConditionalWorker(worker SeqWorker, + condition SequencePredicate, + inplace bool) SeqSliceWorker { + + if condition == nil { + return SeqToSliceWorker(worker,inplace) + } + + f := func(input BioSequenceSlice) BioSequenceSlice { + output := input + if !inplace { + output = MakeBioSequenceSlice(len(input)) + } + for i, s := range input { + if condition(s) { + output[i] = worker(s) + } else { + output[i] = s + } + } return output } @@ -29,3 +76,18 @@ func SeqToSliceWorker(worker SeqWorker, inplace bool) SeqSliceWorker { return f } +func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker { + if worker == nil { + return next + } else { + if next == nil { + return worker + } + } + + f := func(seq *BioSequence) *BioSequence { + return next(worker(seq)) + } + + return f +}