Files
obitools4/pkg/obialign/pairedendalign.go
Eric Coissac 51d11aa36d Changes to be committed:
modified:   pkg/obialign/alignment.go
	modified:   pkg/obialign/pairedendalign.go
	modified:   pkg/obioptions/version.go
	modified:   pkg/obitools/obipairing/pairing.go
2025-02-23 17:37:56 +01:00

653 lines
18 KiB
Go

package obialign
import (
"log"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
type _PeAlignArena struct {
scoreMatrix []int
pathMatrix []int
path []int
fastIndex [][]int
fastBuffer []byte
aligneSeqA *[]byte
aligneSeqB *[]byte
aligneQualA *[]byte
aligneQualB *[]byte
}
// PEAlignArena defines memory arena usable by the
// Paired-End alignment related functions. The same arena can be reused
// from alignment to alignment to limit memory allocation
// and desallocation process.
type PEAlignArena struct {
pointer *_PeAlignArena
}
// NilPEAlignArena is the nil instance of the PEAlignArena
// type.
var NilPEAlignArena = PEAlignArena{nil}
// MakePEAlignArena makes a new arena for the alignment of two paired sequences
// of maximum length indicated by lseqA and lseqB.
func MakePEAlignArena(lseqA, lseqB int) PEAlignArena {
aligneSeqA := make([]byte, 0, lseqA+lseqB)
aligneSeqB := make([]byte, 0, lseqA+lseqB)
aligneQualA := make([]byte, 0, lseqA+lseqB)
aligneQualB := make([]byte, 0, lseqA+lseqB)
a := _PeAlignArena{
scoreMatrix: make([]int, 0, (lseqA+1)*(lseqB+1)),
pathMatrix: make([]int, 0, (lseqA+1)*(lseqB+1)),
path: make([]int, 2*(lseqA+lseqB)),
fastIndex: make([][]int, 256),
fastBuffer: make([]byte, 0, lseqA),
aligneSeqA: &aligneSeqA,
aligneSeqB: &aligneSeqB,
aligneQualA: &aligneQualA,
aligneQualB: &aligneQualB,
}
return PEAlignArena{&a}
}
// _SetMatrices updates the values in matrixA and matrixB at the specified indices with the given values.
//
// The data in the matrix is stored in column-major order.
// Positions in the matrix are numbered from -1
//
// Parameters:
// - matrixA: a pointer to the slice of integers representing matrixA.
// - matrixB: a pointer to the slice of integers representing matrixB.
// - lenA: the length of matrixA.
// - a: the row index a.
// - b: the column index b .
// - valueA: the value to be set in matrixA.
// - valueB: the value to be set in matrixB.
func _SetMatrices(matrixA, matrixB *[]int, lenA, a, b, valueA, valueB int) {
i := (b+1)*(lenA+1) + a + 1
(*matrixA)[i] = valueA
(*matrixB)[i] = valueB
}
// _GetMatrix returns the value at the specified position in the matrix.
//
// The data in the matrix is stored in column-major order.
// Positions in the matrix are numbered from -1
//
// Parameters:
// - matrix: a pointer to the matrix
// - lenA: the length of the matrix
// - a: the row index a.
// - b: the column index b .
//
// Returns:
// - int: the value at the specified position in the matrix
func _GetMatrix(matrix *[]int, lenA, a, b int) int {
return (*matrix)[(b+1)*(lenA+1)+a+1]
}
// Returns left, diag, top compare to the position (a, b)
//
// with a the row index and b the column index.
// Positions in the matrix are numbered from -1
//
// i = (b+1)*(lenA+1) + a + 1
//
// diag = M[a-1][b-1] : i_diag = ((b-1)+1)*(lenA+1) + (a-1) + 1
//
// : i_diag = b*(lenA+1) + a
//
// left = M[a][b-1] : i_left = ((b-1)+1)*(lenA+1) + a + 1
//
// : i_left = b*(lenA+1) + a + 1
//
// top = M[a-1][b] : i_top = (b+1)*(lenA+1) + (a-1) + 1
//
// : i_top = (b+1)*(lenA+1) + a
//
// Parameters:
//
// - matrix: a pointer to the matrix
// - lenA: the number of row -1 in the matrix
// - a: the row index a.
// - b: the column index b .
func _GetMatrixFrom(matrix *[]int, lenA, a, b int) (int, int, int) {
// Formula rechecked on 11/24/2023
i_top := (b+1)*(lenA+1) + a
i_left := i_top - lenA
i_diag := i_left - 1
m := *matrix
return m[i_left], m[i_diag], m[i_top]
}
func _PairingScorePeAlign(baseA, qualA, baseB, qualB byte, scale float64) int {
partMatch := _NucPartMatch[baseA&31][baseB&31]
// log.Printf("id : %f A : %s %d B : %s %d\n", part_match, string(baseA), qualA, string(baseB), qualB)
switch int(partMatch * 100) {
case 100:
return _NucScorePartMatchMatch[qualA][qualB]
case 0:
return int(float64(_NucScorePartMatchMismatch[qualA][qualB])*scale + 0.5)
default:
return int(partMatch*float64(_NucScorePartMatchMatch[qualA][qualB]) +
(1-partMatch)*float64(_NucScorePartMatchMismatch[qualA][qualB])*scale +
0.5)
}
}
// Gaps at the beginning of seqB and at the end of seqA are free
// With seqA spanning over lines and seqB over columns
// - First column gap = 0
// - Last line gaps = 0
//
// Paths are encoded :
// - 0 : for diagonal
// - -1 : for top
// - +1 : for left
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap, scale float64,
scoreMatrix, pathMatrix *[]int) int {
la := len(seqA)
lb := len(seqB)
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gapPenalty := int(scale*gap*float64(_NucScorePartMatchMismatch[40][40]) + 0.5)
needed := (la + 1) * (lb + 1)
if needed > cap(*scoreMatrix) {
*scoreMatrix = make([]int, needed)
}
if needed > cap(*pathMatrix) {
*pathMatrix = make([]int, needed)
}
*scoreMatrix = (*scoreMatrix)[:needed]
*pathMatrix = (*pathMatrix)[:needed]
// Sets the first position of the matrix with 0 score
_SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0)
// Fills the first column with score 0
for i := 0; i < la; i++ {
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, 0, -1)
}
la1 := la - 1 // Except the last line (gaps are free on it)
for j := 0; j < lb; j++ {
// Fill the first line with scores corresponding to a set of gaps
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1)
for i := 0; i < la1; i++ {
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
// log.Infof("LA: i : %d j : %d left : %d diag : %d top : %d\n", i, j, left, diag, top)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j], scale)
left += gapPenalty
top += gapPenalty
switch {
case diag >= left && diag >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
case left >= diag && left >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, left, +1)
default:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, top, -1)
}
// log.Infof("LA: i : %d j : %d left : %d diag : %d top : %d [%d]\n", i, j, left, diag, top, _GetMatrix(scoreMatrix, la, i, j))
}
// Special case for the last line Left gap are free
left, diag, top := _GetMatrixFrom(scoreMatrix, la, la1, j)
diag += _PairingScorePeAlign(seqA[la1], qualA[la1], seqB[j], qualB[j], scale)
top += gapPenalty
switch {
case diag >= left && diag >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, la1, j, diag, 0)
case left >= diag && left >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, la1, j, left, +1)
default:
_SetMatrices(scoreMatrix, pathMatrix, la, la1, j, top, -1)
}
// log.Infof("LA: i : %d j : %d left : %d diag : %d top : %d [%d]\n", la1, j, left, diag, top, _GetMatrix(scoreMatrix, la, la1, j))
}
return _GetMatrix(scoreMatrix, la, la1, lb-1)
}
// Gaps at the beginning of A and at the end of B are free
// With A spanning over lines and B over columns
// - First line gap = 0
// - Last column gaps = 0
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap, scale float64,
scoreMatrix, pathMatrix *[]int) int {
la := len(seqA)
lb := len(seqB)
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gapPenalty := int(scale*gap*float64(_NucScorePartMatchMismatch[40][40]) + 0.5)
needed := (la + 1) * (lb + 1)
if needed > cap(*scoreMatrix) {
*scoreMatrix = make([]int, needed)
}
if needed > cap(*pathMatrix) {
*pathMatrix = make([]int, needed)
}
*scoreMatrix = (*scoreMatrix)[:needed]
*pathMatrix = (*pathMatrix)[:needed]
// Sets the first position of the matrix with 0 score
_SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0)
// Fills the first column with scores corresponding to a set of gaps
for i := 0; i < la; i++ {
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1)
}
lb1 := lb - 1 // Except the last column (gaps are free on it)
for j := 0; j < lb1; j++ {
// Fill the first line with zero score
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, 0, 1)
for i := 0; i < la; i++ {
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j], scale)
left += gapPenalty
top += gapPenalty
switch {
case diag >= left && diag >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
case left >= diag && left >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, left, +1)
default:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, top, -1)
}
// log.Infof("LR: i : %d j : %d left : %d diag : %d top : %d [%d]\n", i, j, left, diag, top, _GetMatrix(scoreMatrix, la, i, j))
}
}
// Special case for the last colump Up gap are free
_SetMatrices(scoreMatrix, pathMatrix, la, -1, lb1, 0, 1)
for i := 0; i < la; i++ {
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, lb1)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[lb1], qualB[lb1], scale)
left += gapPenalty
// log.Infof("LR: i : %d j : %d left : %d diag : %d top : %d [%d]\n", i, lb1, left, diag, top, _GetMatrix(scoreMatrix, la, i, lb1))
switch {
case diag >= left && diag >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, diag, 0)
case left >= diag && left >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, left, +1)
default:
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, top, -1)
}
}
return _GetMatrix(scoreMatrix, la, la-1, lb1)
}
// Gaps at the beginning and at the end of seqA are free
// With seqA spanning over lines and seqB over columns
//
// SeqA must be the longer sequence. If that constraint is not
// respected, the function will panic.
//
// TO BE FINISHED
// - First column gap = 0
// - Last column gaps = 0
//
// Paths are encoded :
// - 0 : for diagonal
// - -1 : for top
// - +1 : for left
func _FillMatrixPeCenterAlign(seqA, qualA, seqB, qualB []byte, gap, scale float64,
scoreMatrix, pathMatrix *[]int) int {
la := len(seqA)
lb := len(seqB)
if len(seqA) < len(seqB) {
log.Panicf("len(seqA) < len(seqB) : %d < %d", len(seqA), len(seqB))
}
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gapPenalty := int(scale*gap*float64(_NucScorePartMatchMismatch[40][40]) + 0.5)
needed := (la + 1) * (lb + 1)
if needed > cap(*scoreMatrix) {
*scoreMatrix = make([]int, needed)
}
if needed > cap(*pathMatrix) {
*pathMatrix = make([]int, needed)
}
*scoreMatrix = (*scoreMatrix)[:needed]
*pathMatrix = (*pathMatrix)[:needed]
// Sets the first position of the matrix with 0 score
_SetMatrices(scoreMatrix, pathMatrix, la, -1, -1, 0, 0)
// Fills the first column with score 0
for i := 0; i < la; i++ {
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, 0, -1)
}
// la1 := la - 1 // Except the last line (gaps are free on it)
lb1 := lb - 1 // Except the last column (gaps are free on it)
for j := 0; j < lb1; j++ {
// Fill the first line with scores corresponding to a set of gaps
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1)
for i := 0; i < la; i++ {
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
// log.Infof("LA: i : %d j : %d left : %d diag : %d top : %d\n", i, j, left, diag, top)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j], scale)
left += gapPenalty
top += gapPenalty
switch {
case diag >= left && diag >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
case left >= diag && left >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, left, +1)
default:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, top, -1)
}
// log.Infof("LA: i : %d j : %d left : %d diag : %d top : %d [%d]\n", i, j, left, diag, top, _GetMatrix(scoreMatrix, la, i, j))
}
}
for i := 0; i < la; i++ {
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, lb1)
// log.Infof("LA: i : %d j : %d left : %d diag : %d top : %d\n", i, j, left, diag, top)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[lb1], qualB[lb1], scale)
left += gapPenalty
switch {
case diag >= left && diag >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, diag, 0)
case left >= diag && left >= top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, left, +1)
default:
_SetMatrices(scoreMatrix, pathMatrix, la, i, lb1, top, -1)
}
// log.Infof("LA: i : %d j : %d left : %d diag : %d top : %d [%d]\n", i, j, left, diag, top, _GetMatrix(scoreMatrix, la, i, j))
}
return _GetMatrix(scoreMatrix, la, la-1, lb1)
}
func PELeftAlign(seqA, seqB *obiseq.BioSequence, gap, scale float64,
arena PEAlignArena) (int, []int) {
if !_InitializedDnaScore {
_InitDNAScoreMatrix()
}
if arena.pointer == nil {
arena = MakePEAlignArena(seqA.Len(), seqB.Len())
}
score := _FillMatrixPeLeftAlign(seqA.Sequence(), seqA.Qualities(),
seqB.Sequence(), seqB.Qualities(), gap, scale,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
path := _Backtracking(arena.pointer.pathMatrix,
seqA.Len(), seqB.Len(),
&arena.pointer.path)
return score, path
}
func PERightAlign(seqA, seqB *obiseq.BioSequence, gap, scale float64,
arena PEAlignArena) (int, []int) {
if !_InitializedDnaScore {
_InitDNAScoreMatrix()
}
if arena.pointer == nil {
arena = MakePEAlignArena(seqA.Len(), seqB.Len())
}
score := _FillMatrixPeRightAlign(seqA.Sequence(), seqA.Qualities(),
seqB.Sequence(), seqB.Qualities(), gap, scale,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
path := _Backtracking(arena.pointer.pathMatrix,
seqA.Len(), seqB.Len(),
&arena.pointer.path)
return score, path
}
func PECenterAlign(seqA, seqB *obiseq.BioSequence, gap, scale float64,
arena PEAlignArena) (int, []int) {
if !_InitializedDnaScore {
_InitDNAScoreMatrix()
}
if arena.pointer == nil {
arena = MakePEAlignArena(seqA.Len(), seqB.Len())
}
score := _FillMatrixPeCenterAlign(seqA.Sequence(), seqA.Qualities(),
seqB.Sequence(), seqB.Qualities(), gap, scale,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
path := _Backtracking(arena.pointer.pathMatrix,
seqA.Len(), seqB.Len(),
&arena.pointer.path)
return score, path
}
func PEAlign(seqA, seqB *obiseq.BioSequence,
gap, scale float64, fastAlign bool, delta int, fastScoreRel bool,
arena PEAlignArena, shift_buff *map[int]int) (bool, int, []int, int, int, float64) {
var isLeftAlign bool
var score, shift int
var startA, startB int
var partLen, over int
var rawSeqA, qualSeqA []byte
var rawSeqB, qualSeqB []byte
var extra5, extra3 int
var path []int
if !_InitializedDnaScore {
_InitDNAScoreMatrix()
}
fastCount := -1
fastScore := -1.0
if fastAlign {
index := obikmer.Index4mer(seqA,
&arena.pointer.fastIndex,
&arena.pointer.fastBuffer)
shift, fastCount, fastScore = obikmer.FastShiftFourMer(index, shift_buff, seqA.Len(), seqB, fastScoreRel, nil)
if shift > 0 {
over = seqA.Len() - shift
} else {
over = seqB.Len() + shift
}
// At least one mismatch exists in the overlaping region
if fastCount+3 < over {
if shift > 0 {
startA = shift - delta
if startA < 0 {
startA = 0
}
extra5 = -startA
startB = 0
rawSeqA = seqA.Sequence()[startA:]
qualSeqA = seqA.Qualities()[startA:]
partLen = len(rawSeqA)
if partLen > seqB.Len() {
partLen = seqB.Len()
}
rawSeqB = seqB.Sequence()[0:partLen]
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
isLeftAlign = true
score = _FillMatrixPeLeftAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, scale,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
} else {
startA = 0
startB = -shift - delta
if startB < 0 {
startB = 0
}
extra5 = startB
rawSeqB = seqB.Sequence()[startB:]
qualSeqB = seqB.Qualities()[startB:]
partLen = len(rawSeqB)
if partLen > seqA.Len() {
partLen = seqA.Len()
}
rawSeqA = seqA.Sequence()[:partLen]
qualSeqA = seqA.Qualities()[:partLen]
extra3 = partLen - seqA.Len()
isLeftAlign = false
score = _FillMatrixPeRightAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, scale,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
}
path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB),
&arena.pointer.path)
} else {
// Both overlaping regions are identicals
if shift > 0 {
startA = shift
startB = 0
extra5 = -startA
qualSeqA = seqA.Qualities()[startA:]
partLen = len(qualSeqA)
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
score = 0
isLeftAlign = true
} else {
startA = 0
startB = -shift
extra5 = startB
qualSeqB = seqB.Qualities()[startB:]
partLen = len(qualSeqB)
extra3 = partLen - seqA.Len()
qualSeqA = seqA.Qualities()[:partLen]
isLeftAlign = false
}
score = 0
for i, qualA := range qualSeqA {
qualB := qualSeqB[i]
score += _NucScorePartMatchMatch[qualA][qualB]
}
path = arena.pointer.path[:0]
path = append(path, 0, partLen)
}
path[0] += extra5
if path[len(path)-1] == 0 {
path[len(path)-2] += extra3
} else {
path = append(path, extra3, 0)
}
} else {
//
// No Fast Heuristic
//
rawSeqA = seqA.Sequence()
qualSeqA = seqA.Qualities()
rawSeqB = seqB.Sequence()
qualSeqB = seqB.Qualities()
scoreR := _FillMatrixPeRightAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, scale,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
score = scoreR
path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB),
&(arena.pointer.path))
isLeftAlign = false
scoreL := _FillMatrixPeLeftAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, scale,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
if scoreL > scoreR {
path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB),
&(arena.pointer.path))
isLeftAlign = true
score = scoreL
}
}
return isLeftAlign, score, path, fastCount, over, fastScore
}