Files
obitools4/pkg/obialign/pairedendalign.go

386 lines
10 KiB
Go
Raw Normal View History

2022-01-13 23:27:39 +01:00
package obialign
import (
2022-02-24 12:14:52 +01:00
log "github.com/sirupsen/logrus"
2022-01-13 23:27:39 +01:00
2022-01-13 23:43:01 +01:00
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
2022-01-13 23:27:39 +01:00
)
2022-01-14 15:46:36 +01:00
type _PeAlignArena struct {
scoreMatrix []int
pathMatrix []int
path []int
fastIndex [][]int
fastBuffer []byte
2022-01-13 23:27:39 +01:00
}
2022-01-14 15:46:36 +01:00
// PEAlignArena defines memory arena usable by the
// Paired-End alignment related functions. The same arena can be reused
// from alignment to alignment to limit memory allocation
// and desallocation process.
2022-01-13 23:27:39 +01:00
type PEAlignArena struct {
2022-01-14 15:46:36 +01:00
pointer *_PeAlignArena
2022-01-13 23:27:39 +01:00
}
2022-01-14 15:46:36 +01:00
// NilPEAlignArena is the nil instance of the PEAlignArena
// type.
2022-01-13 23:27:39 +01:00
var NilPEAlignArena = PEAlignArena{nil}
2022-01-14 15:46:36 +01:00
// MakePEAlignArena makes a new arena for the alignment of two paired sequences
// of maximum length indicated by lseqA and lseqB.
2022-01-13 23:27:39 +01:00
func MakePEAlignArena(lseqA, lseqB int) PEAlignArena {
2022-01-14 15:46:36 +01:00
a := _PeAlignArena{
scoreMatrix: make([]int, 0, (lseqA+1)*(lseqB+1)),
pathMatrix: make([]int, 0, (lseqA+1)*(lseqB+1)),
path: make([]int, 2*(lseqA+lseqB)),
fastIndex: make([][]int, 256),
fastBuffer: make([]byte, 0, lseqA),
2022-01-13 23:27:39 +01:00
}
return PEAlignArena{&a}
}
2022-01-14 15:46:36 +01:00
func _SetMatrices(matrixA, matrixB *[]int, lenA, a, b, valueA, valueB int) {
2022-01-13 23:27:39 +01:00
i := (b+1)*(lenA+1) + a + 1
(*matrixA)[i] = valueA
(*matrixB)[i] = valueB
}
2022-01-14 15:46:36 +01:00
func _GetMatrix(matrix *[]int, lenA, a, b int) int {
2022-01-13 23:27:39 +01:00
return (*matrix)[(b+1)*(lenA+1)+a+1]
}
2022-01-14 15:46:36 +01:00
func _GetMatrixFrom(matrix *[]int, lenA, a, b int) (int, int, int) {
2022-01-13 23:27:39 +01:00
i := (b+1)*(lenA+1) + a
j := i - lenA
m := *matrix
return m[j], m[j-1], m[i]
}
2022-01-14 15:46:36 +01:00
func _PairingScorePeAlign(baseA, qualA, baseB, qualB byte) int {
partMatch := _NucPartMatch[baseA&31][baseB&31]
2022-01-13 23:27:39 +01:00
// log.Printf("id : %f A : %s %d B : %s %d\n", part_match, string(baseA), qualA, string(baseB), qualB)
switch int(partMatch * 100) {
case 100:
2022-01-14 15:36:39 +01:00
return _NucScorePartMatchMatch[qualA][qualB]
case 0:
2022-01-14 15:36:39 +01:00
return _NucScorePartMatchMismatch[qualA][qualB]
2022-01-13 23:27:39 +01:00
default:
2022-01-14 15:46:36 +01:00
return int(partMatch*float64(_NucScorePartMatchMatch[qualA][qualB]) +
(1-partMatch)*float64(_NucScorePartMatchMismatch[qualA][qualB]) + 0.5)
2022-01-13 23:27:39 +01:00
}
}
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64,
2022-01-14 15:46:36 +01:00
scoreMatrix, pathMatrix *[]int) int {
2022-01-13 23:27:39 +01:00
la := len(seqA)
lb := len(seqB)
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40]))
2022-01-13 23:27:39 +01:00
needed := (la + 1) * (lb + 1)
2022-01-14 15:46:36 +01:00
if needed > cap(*scoreMatrix) {
*scoreMatrix = make([]int, needed)
2022-01-13 23:27:39 +01:00
}
2022-01-14 15:46:36 +01:00
if needed > cap(*pathMatrix) {
*pathMatrix = make([]int, needed)
2022-01-13 23:27:39 +01:00
}
2022-01-14 15:46:36 +01:00
*scoreMatrix = (*scoreMatrix)[:needed]
*pathMatrix = (*pathMatrix)[:needed]
2022-01-13 23:27:39 +01:00
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0)
2022-01-13 23:27:39 +01:00
// Fills the first column with score 0
for i := 0; i < la; i++ {
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, 0, -1)
2022-01-13 23:27:39 +01:00
}
la1 := la - 1
for j := 0; j < lb; j++ {
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1)
2022-01-13 23:27:39 +01:00
for i := 0; i < la1; i++ {
2022-01-14 15:46:36 +01:00
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
left += gapPenalty
top += gapPenalty
2022-01-13 23:27:39 +01:00
switch {
case diag > left && diag > top:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
2022-01-13 23:27:39 +01:00
case left > diag && left > top:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, left, +1)
2022-01-13 23:27:39 +01:00
default:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, top, -1)
2022-01-13 23:27:39 +01:00
}
}
// Special case for the last line Left gap are free
2022-01-14 15:46:36 +01:00
left, diag, top := _GetMatrixFrom(scoreMatrix, la, la1, j)
diag += _PairingScorePeAlign(seqA[la1], qualA[la1], seqB[j], qualB[j])
top += gapPenalty
2022-01-13 23:27:39 +01:00
switch {
case diag > left && diag > top:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, la1, j, diag, 0)
2022-01-13 23:27:39 +01:00
case left > diag && left > top:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, la1, j, left, +1)
2022-01-13 23:27:39 +01:00
default:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, la1, j, top, -1)
2022-01-13 23:27:39 +01:00
}
}
2022-01-14 15:46:36 +01:00
return _GetMatrix(scoreMatrix, la, la1, lb-1)
2022-01-13 23:27:39 +01:00
}
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64,
2022-01-14 15:46:36 +01:00
scoreMatrix, pathMatrix *[]int) int {
2022-01-13 23:27:39 +01:00
la := len(seqA)
lb := len(seqB)
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40]))
2022-01-13 23:27:39 +01:00
needed := (la + 1) * (lb + 1)
2022-01-14 15:46:36 +01:00
if needed > cap(*scoreMatrix) {
*scoreMatrix = make([]int, needed)
2022-01-13 23:27:39 +01:00
}
2022-01-14 15:46:36 +01:00
if needed > cap(*pathMatrix) {
*pathMatrix = make([]int, needed)
2022-01-13 23:27:39 +01:00
}
2022-01-14 15:46:36 +01:00
*scoreMatrix = (*scoreMatrix)[:needed]
*pathMatrix = (*pathMatrix)[:needed]
2022-01-13 23:27:39 +01:00
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0)
2022-01-13 23:27:39 +01:00
// Fills the first column with score 0
for i := 0; i < la; i++ {
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1)
2022-01-13 23:27:39 +01:00
}
lb1 := lb - 1
for j := 0; j < lb1; j++ {
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, 0, 1)
2022-01-13 23:27:39 +01:00
for i := 0; i < la; i++ {
2022-01-14 15:46:36 +01:00
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
2022-01-13 23:27:39 +01:00
2022-01-14 15:46:36 +01:00
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
left += gapPenalty
top += gapPenalty
2022-01-13 23:27:39 +01:00
switch {
case diag > left && left > top:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
2022-01-13 23:27:39 +01:00
case left > diag && left > top:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, left, +1)
2022-01-13 23:27:39 +01:00
default:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, top, -1)
2022-01-13 23:27:39 +01:00
}
}
}
// Special case for the last colump Up gap are free
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, -1, lb1, 0, 1)
2022-01-13 23:27:39 +01:00
for i := 0; i < la; i++ {
2022-01-14 15:46:36 +01:00
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, lb1)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[lb1], qualB[lb1])
left += gapPenalty
2022-01-13 23:27:39 +01:00
switch {
case diag > left && diag > top:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, diag, 0)
2022-01-13 23:27:39 +01:00
case left > diag && left > top:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, left, +1)
2022-01-13 23:27:39 +01:00
default:
2022-01-14 15:46:36 +01:00
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, top, -1)
2022-01-13 23:27:39 +01:00
}
}
2022-01-14 15:46:36 +01:00
return _GetMatrix(scoreMatrix, la, la-1, lb1)
2022-01-13 23:27:39 +01:00
}
func PELeftAlign(seqA, seqB *obiseq.BioSequence, gap float64,
arena PEAlignArena) (int, []int) {
2022-01-13 23:27:39 +01:00
2022-01-14 15:36:39 +01:00
if !_InitializedDnaScore {
2022-01-13 23:27:39 +01:00
log.Println("Initializing the DNA Scoring matrix")
2022-01-14 15:36:39 +01:00
_InitDNAScoreMatrix()
2022-01-13 23:27:39 +01:00
}
if arena.pointer == nil {
arena = MakePEAlignArena(seqA.Length(), seqB.Length())
}
2022-01-14 15:46:36 +01:00
score := _FillMatrixPeLeftAlign(seqA.Sequence(), seqA.Qualities(),
2022-01-13 23:27:39 +01:00
seqB.Sequence(), seqB.Qualities(), gap,
2022-01-14 15:46:36 +01:00
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
2022-01-13 23:27:39 +01:00
2022-01-14 15:46:36 +01:00
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
2022-01-13 23:27:39 +01:00
seqA.Length(), seqB.Length(),
&arena.pointer.path)
return score, arena.pointer.path
}
func PERightAlign(seqA, seqB *obiseq.BioSequence, gap float64,
arena PEAlignArena) (int, []int) {
2022-01-13 23:27:39 +01:00
2022-01-14 15:36:39 +01:00
if !_InitializedDnaScore {
2022-01-13 23:27:39 +01:00
log.Println("Initializing the DNA Scoring matrix")
2022-01-14 15:36:39 +01:00
_InitDNAScoreMatrix()
2022-01-13 23:27:39 +01:00
}
if arena.pointer == nil {
arena = MakePEAlignArena(seqA.Length(), seqB.Length())
}
2022-01-14 15:46:36 +01:00
score := _FillMatrixPeRightAlign(seqA.Sequence(), seqA.Qualities(),
2022-01-13 23:27:39 +01:00
seqB.Sequence(), seqB.Qualities(), gap,
2022-01-14 15:46:36 +01:00
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
2022-01-13 23:27:39 +01:00
2022-01-14 15:46:36 +01:00
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
2022-01-13 23:27:39 +01:00
seqA.Length(), seqB.Length(),
&arena.pointer.path)
return score, arena.pointer.path
}
func PEAlign(seqA, seqB *obiseq.BioSequence,
gap float64, delta int,
2022-01-13 23:27:39 +01:00
arena PEAlignArena) (int, []int) {
var score, shift int
var startA, startB int
2022-01-14 16:10:19 +01:00
var partLen, over int
var rawSeqA, qualSeqA []byte
var rawSeqB, qualSeqB []byte
2022-01-13 23:27:39 +01:00
var extra5, extra3 int
2022-01-14 15:36:39 +01:00
if !_InitializedDnaScore {
2022-01-13 23:27:39 +01:00
log.Println("Initializing the DNA Scoring matrix")
2022-01-14 15:36:39 +01:00
_InitDNAScoreMatrix()
2022-01-13 23:27:39 +01:00
}
index := obikmer.Index4mer(seqA,
2022-01-14 15:46:36 +01:00
&arena.pointer.fastIndex,
&arena.pointer.fastBuffer)
2022-01-13 23:27:39 +01:00
2022-01-14 16:10:19 +01:00
shift, fastScore := obikmer.FastShiftFourMer(index, seqB, nil)
2022-01-13 23:27:39 +01:00
if shift > 0 {
over = seqA.Length() - shift
} else {
over = seqB.Length() + shift
}
// log.Println(seqA.String())
// log.Println(seqB.String())
// log.Printf("Shift : %d Score : %d Over : %d La : %d:%d Lb: %d:%d\n", shift, fastScore, over, seqA.Length(), len(seqA.Qualities()), seqB.Length(), len(seqB.Qualities()))
2022-01-14 16:10:19 +01:00
if fastScore+3 < over {
2022-01-16 00:21:42 +01:00
// At least one mismatch exists in the overlaping region
2022-01-13 23:27:39 +01:00
if shift > 0 {
startA = shift - delta
if startA < 0 {
startA = 0
}
extra5 = -startA
startB = 0
2022-01-14 16:10:19 +01:00
rawSeqA = seqA.Sequence()[startA:]
qualSeqA = seqA.Qualities()[startA:]
partLen = len(rawSeqA)
rawSeqB = seqB.Sequence()[0:partLen]
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Length() - partLen
2022-01-14 15:46:36 +01:00
score = _FillMatrixPeLeftAlign(
2022-01-14 16:10:19 +01:00
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
2022-01-14 15:46:36 +01:00
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
2022-01-13 23:27:39 +01:00
} else {
2022-01-16 00:21:42 +01:00
// Both overlaping regions are identicals
2022-01-13 23:27:39 +01:00
startA = 0
startB = -shift - delta
if startB < 0 {
startB = 0
}
extra5 = startB
2022-01-14 16:10:19 +01:00
rawSeqB = seqB.Sequence()[startB:]
qualSeqB = seqB.Qualities()[startB:]
partLen = len(rawSeqB)
rawSeqA = seqA.Sequence()[:partLen]
qualSeqA = seqA.Qualities()[:partLen]
extra3 = partLen - seqA.Length()
2022-01-14 15:46:36 +01:00
score = _FillMatrixPeRightAlign(
2022-01-14 16:10:19 +01:00
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
2022-01-14 15:46:36 +01:00
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
2022-01-13 23:27:39 +01:00
}
2022-01-14 15:46:36 +01:00
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
2022-01-14 16:10:19 +01:00
len(rawSeqA), len(rawSeqB),
2022-01-13 23:27:39 +01:00
&arena.pointer.path)
} else {
if shift > 0 {
startA = shift
startB = 0
extra5 = -startA
2022-01-14 16:10:19 +01:00
qualSeqA = seqA.Qualities()[startA:]
partLen = len(qualSeqA)
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Length() - partLen
2022-01-13 23:27:39 +01:00
score = 0
} else {
startA = 0
startB = -shift
extra5 = startB
2022-01-14 16:10:19 +01:00
qualSeqB = seqB.Qualities()[startB:]
partLen = len(qualSeqB)
extra3 = partLen - seqA.Length()
qualSeqA = seqA.Qualities()[:partLen]
2022-01-13 23:27:39 +01:00
}
score = 0
2022-01-14 16:10:19 +01:00
for i, qualA := range qualSeqA {
qualB := qualSeqB[i]
2022-01-14 15:36:39 +01:00
score += _NucScorePartMatchMatch[qualA][qualB]
2022-01-13 23:27:39 +01:00
}
arena.pointer.path = arena.pointer.path[:0]
2022-01-14 16:10:19 +01:00
arena.pointer.path = append(arena.pointer.path, 0, partLen)
2022-01-13 23:27:39 +01:00
}
arena.pointer.path[0] += extra5
if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
arena.pointer.path[len(arena.pointer.path)-2] += extra3
} else {
arena.pointer.path = append(arena.pointer.path, extra3, 0)
}
return score, arena.pointer.path
}