Patch an aligment bug on obipairing

This commit is contained in:
2023-01-25 13:22:56 +01:00
parent 9b7edf0908
commit cfddc78161
4 changed files with 112 additions and 27 deletions

View File

@ -55,21 +55,21 @@ func _MatchScoreRatio(a, b byte) (float64, float64) {
l2 := math.Log(2) l2 := math.Log(2)
l3 := math.Log(3) l3 := math.Log(3)
l4 := math.Log(4)
l10 := math.Log(10) l10 := math.Log(10)
lE1 := -float64(a)/10*l10 - l4 lalea := math.Log(30) // 1 /(change of the random model)
lE2 := -float64(b)/10*l10 - l4 lE1 := -float64(a)/10*l10 - l3 // log proba of sequencing error on A/3
lO1 := math.Log1p(-math.Exp(lE1 + l3)) lE2 := -float64(b)/10*l10 - l3 // log proba of sequencing error on B/3
lO2 := math.Log1p(-math.Exp(lE2 + l3)) lO1 := math.Log1p(-math.Exp(lE1 + l3)) // log proba no being an error on A
lO2 := math.Log1p(-math.Exp(lE2 + l3)) // log proba no being an error on B
lO1O2 := lO1 + lO2 lO1O2 := lO1 + lO2
lE1E2 := lE1 + lE2 lE1E2 := lE1 + lE2
lO1E2 := lO1 + lE2 lO1E2 := lO1 + lE2
lO2E1 := lO2 + lE1 lO2E1 := lO2 + lE1
MM := _Logaddexp(lO1O2, lE1E2+l3) + l4 MM := _Logaddexp(lO1O2, lE1E2+l3) // Proba match when match observed
Mm := _Logaddexp(_Logaddexp(lO1E2, lO2E1), lE1E2+l2) + l4 Mm := _Logaddexp(_Logaddexp(lO1E2, lO2E1), lE1E2+l2) // Proba match when mismatch observed
return MM, Mm return MM + lalea, Mm + lalea
} }
func _InitNucPartMatch() { func _InitNucPartMatch() {

View File

@ -52,6 +52,7 @@ func _GetMatrix(matrix *[]int, lenA, a, b int) int {
} }
func _GetMatrixFrom(matrix *[]int, lenA, a, b int) (int, int, int) { func _GetMatrixFrom(matrix *[]int, lenA, a, b int) (int, int, int) {
// Formula rechecked on 01/25/2023
i := (b+1)*(lenA+1) + a i := (b+1)*(lenA+1) + a
j := i - lenA j := i - lenA
m := *matrix m := *matrix
@ -68,10 +69,15 @@ func _PairingScorePeAlign(baseA, qualA, baseB, qualB byte) int {
return _NucScorePartMatchMismatch[qualA][qualB] return _NucScorePartMatchMismatch[qualA][qualB]
default: default:
return int(partMatch*float64(_NucScorePartMatchMatch[qualA][qualB]) + return int(partMatch*float64(_NucScorePartMatchMatch[qualA][qualB]) +
(1-partMatch)*float64(_NucScorePartMatchMismatch[qualA][qualB]) + 0.5) (1-partMatch)*float64(_NucScorePartMatchMismatch[qualA][qualB]) +
0.5)
} }
} }
// Gaps at the beginning of B and at the end of A are free
// With A spanning over lines and B over columns
// - First column gap = 0
// - Last line gaps = 0
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64, func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64,
scoreMatrix, pathMatrix *[]int) int { scoreMatrix, pathMatrix *[]int) int {
@ -95,6 +101,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64,
*scoreMatrix = (*scoreMatrix)[:needed] *scoreMatrix = (*scoreMatrix)[:needed]
*pathMatrix = (*pathMatrix)[:needed] *pathMatrix = (*pathMatrix)[:needed]
// Sets the first position of the matrix with 0 score
_SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0) _SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0)
// Fills the first column with score 0 // Fills the first column with score 0
@ -102,17 +109,20 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64,
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, 0, -1) _SetMatrices(scoreMatrix, pathMatrix, la, i, -1, 0, -1)
} }
la1 := la - 1 la1 := la - 1 // Except the last line (gaps are free on it)
for j := 0; j < lb; j++ { for j := 0; j < lb; j++ {
// Fill the first line with scores corresponding to a set of gaps
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1) _SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1)
for i := 0; i < la1; i++ { for i := 0; i < la1; i++ {
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j) left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j]) diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
left += gapPenalty left += gapPenalty
top += gapPenalty top += gapPenalty
switch { switch {
case diag > left && diag > top: case diag > left && diag > top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0) _SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
@ -143,6 +153,10 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64,
return _GetMatrix(scoreMatrix, la, la1, lb-1) return _GetMatrix(scoreMatrix, la, la1, lb-1)
} }
// Gaps at the beginning of A and at the end of B are free
// With A spanning over lines and B over columns
// - First line gap = 0
// - Last column gaps = 0
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64, func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64,
scoreMatrix, pathMatrix *[]int) int { scoreMatrix, pathMatrix *[]int) int {
@ -166,17 +180,19 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64,
*scoreMatrix = (*scoreMatrix)[:needed] *scoreMatrix = (*scoreMatrix)[:needed]
*pathMatrix = (*pathMatrix)[:needed] *pathMatrix = (*pathMatrix)[:needed]
// Sets the first position of the matrix with 0 score
_SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0) _SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0)
// Fills the first column with score 0 // Fills the first column with scores corresponding to a set of gaps
for i := 0; i < la; i++ { for i := 0; i < la; i++ {
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1) _SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1)
} }
lb1 := lb - 1 lb1 := lb - 1 // Except the last column (gaps are free on it)
for j := 0; j < lb1; j++ { for j := 0; j < lb1; j++ {
// Fill the first line with zero score
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, 0, 1) _SetMatrices(scoreMatrix, pathMatrix, la, -1, j, 0, 1)
for i := 0; i < la; i++ { for i := 0; i < la; i++ {
@ -187,7 +203,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64,
top += gapPenalty top += gapPenalty
switch { switch {
case diag > left && left > top: case diag > left && diag > top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0) _SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
case left > diag && left > top: case left > diag && left > top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, left, +1) _SetMatrices(scoreMatrix, pathMatrix, la, i, j, left, +1)
@ -218,6 +234,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64,
} }
return _GetMatrix(scoreMatrix, la, la-1, lb1) return _GetMatrix(scoreMatrix, la, la-1, lb1)
} }
func PELeftAlign(seqA, seqB *obiseq.BioSequence, gap float64, func PELeftAlign(seqA, seqB *obiseq.BioSequence, gap float64,
@ -295,10 +312,8 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
over = seqB.Len() + shift over = seqB.Len() + shift
} }
if fastScore+3 < over {
// At least one mismatch exists in the overlaping region // At least one mismatch exists in the overlaping region
if fastScore+3 < over {
if shift > 0 { if shift > 0 {
startA = shift - delta startA = shift - delta
@ -319,8 +334,6 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
&arena.pointer.pathMatrix) &arena.pointer.pathMatrix)
} else { } else {
// Both overlaping regions are identicals
startA = 0 startA = 0
startB = -shift - delta startB = -shift - delta
if startB < 0 { if startB < 0 {
@ -333,6 +346,7 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
rawSeqA = seqA.Sequence()[:partLen] rawSeqA = seqA.Sequence()[:partLen]
qualSeqA = seqA.Qualities()[:partLen] qualSeqA = seqA.Qualities()[:partLen]
extra3 = partLen - seqA.Len() extra3 = partLen - seqA.Len()
score = _FillMatrixPeRightAlign( score = _FillMatrixPeRightAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix, &arena.pointer.scoreMatrix,
@ -344,6 +358,9 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
&arena.pointer.path) &arena.pointer.path)
} else { } else {
// Both overlaping regions are identicals
if shift > 0 { if shift > 0 {
startA = shift startA = shift
startB = 0 startB = 0

View File

@ -19,13 +19,19 @@ var _BioSequenceSlicePool = sync.Pool{
} }
// > This function returns a pointer to a new `BioSequenceSlice` object // > This function returns a pointer to a new `BioSequenceSlice` object
func NewBioSequenceSlice() *BioSequenceSlice { func NewBioSequenceSlice(size ...int) *BioSequenceSlice {
return _BioSequenceSlicePool.Get().(*BioSequenceSlice) slice := _BioSequenceSlicePool.Get().(*BioSequenceSlice)
if len(size) > 0 {
s := size[0]
slice.InsureCapacity(s)
(*slice)=(*slice)[0:s]
}
return slice
} }
// `MakeBioSequenceSlice()` returns a pointer to a new `BioSequenceSlice` struct // `MakeBioSequenceSlice()` returns a pointer to a new `BioSequenceSlice` struct
func MakeBioSequenceSlice() BioSequenceSlice { func MakeBioSequenceSlice(size ...int) BioSequenceSlice {
return *NewBioSequenceSlice() return *NewBioSequenceSlice(size...)
} }
func (s *BioSequenceSlice) Recycle() { func (s *BioSequenceSlice) Recycle() {

View File

@ -5,6 +5,10 @@ type SeqAnnotator func(*BioSequence)
type SeqWorker func(*BioSequence) *BioSequence type SeqWorker func(*BioSequence) *BioSequence
type SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice type SeqSliceWorker func(BioSequenceSlice) BioSequenceSlice
func NilSeqWorker(seq *BioSequence) *BioSequence {
return seq
}
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker { func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
f := func(seq *BioSequence) *BioSequence { f := func(seq *BioSequence) *BioSequence {
function(seq) function(seq)
@ -14,13 +18,56 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
} }
func SeqToSliceWorker(worker SeqWorker, inplace bool) SeqSliceWorker { func SeqToSliceWorker(worker SeqWorker, inplace bool) SeqSliceWorker {
var f SeqSliceWorker
if worker == nil {
if inplace {
f = func(input BioSequenceSlice) BioSequenceSlice {
return input
}
} else {
f = func(input BioSequenceSlice) BioSequenceSlice {
output := MakeBioSequenceSlice(len(input))
copy(output,input)
return output
}
}
} else {
f = func(input BioSequenceSlice) BioSequenceSlice {
output := input
if !inplace {
output = MakeBioSequenceSlice(len(input))
}
for i, s := range input {
output[i] = worker(s)
}
return output
}
}
return f
}
func SeqToSliceConditionalWorker(worker SeqWorker,
condition SequencePredicate,
inplace bool) SeqSliceWorker {
if condition == nil {
return SeqToSliceWorker(worker,inplace)
}
f := func(input BioSequenceSlice) BioSequenceSlice { f := func(input BioSequenceSlice) BioSequenceSlice {
output := input output := input
if (! inplace) { if !inplace {
output = MakeBioSequenceSlice() output = MakeBioSequenceSlice(len(input))
} }
for i,s := range(input) { for i, s := range input {
if condition(s) {
output[i] = worker(s) output[i] = worker(s)
} else {
output[i] = s
}
} }
return output return output
@ -29,3 +76,18 @@ func SeqToSliceWorker(worker SeqWorker, inplace bool) SeqSliceWorker {
return f return f
} }
func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker {
if worker == nil {
return next
} else {
if next == nil {
return worker
}
}
f := func(seq *BioSequence) *BioSequence {
return next(worker(seq))
}
return f
}