First commit

This commit is contained in:
2022-01-13 23:27:39 +01:00
parent dab6549cad
commit f53bf1b804
93 changed files with 11042 additions and 0 deletions

167
pkg/obialign/alignment.go Normal file
View File

@@ -0,0 +1,167 @@
package obialign
import (
"math"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
type __build_align_arena__ struct {
bufferA []byte
bufferB []byte
}
type BuildAlignArena struct {
pointer *__build_align_arena__
}
var NilBuildAlignArena = BuildAlignArena{nil}
func MakeBuildAlignArena(lseqA, lseqB int) BuildAlignArena {
a := __build_align_arena__{
bufferA: make([]byte, lseqA+lseqB),
bufferB: make([]byte, lseqA+lseqB),
}
return BuildAlignArena{&a}
}
func __build_alignment__(seqA, seqB []byte, path []int, gap byte,
bufferA, bufferB *[]byte) ([]byte, []byte) {
if bufferA == nil {
b := make([]byte, 0, len(seqA)+len(seqB))
bufferA = &b
}
if bufferB == nil {
b := make([]byte, 0, len(seqA)+len(seqB))
bufferB = &b
}
*bufferA = (*bufferA)[:0]
*bufferB = (*bufferB)[:0]
lp := len(path)
pos_a := 0
pos_b := 0
for i := 0; i < lp; i++ {
step := path[i]
if step < 0 {
*bufferA = append(*bufferA, seqA[pos_a:(pos_a-step)]...)
for j := 0; j < -step; j++ {
*bufferB = append(*bufferB, gap)
}
pos_a -= step
}
if step > 0 {
*bufferB = append(*bufferB, seqB[pos_b:(pos_b+step)]...)
for j := 0; j < step; j++ {
*bufferA = append(*bufferA, gap)
}
pos_b += step
}
i++
step = path[i]
if step > 0 {
*bufferA = append(*bufferA, seqA[pos_a:(pos_a+step)]...)
*bufferB = append(*bufferB, seqB[pos_b:(pos_b+step)]...)
pos_a += step
pos_b += step
}
}
return *bufferA, *bufferB
}
func BuildAlignment(seqA, seqB obiseq.BioSequence,
path []int, gap byte, arena BuildAlignArena) (obiseq.BioSequence, obiseq.BioSequence) {
if arena.pointer == nil {
arena = MakeBuildAlignArena(seqA.Length(), seqB.Length())
}
A, B := __build_alignment__(seqA.Sequence(), seqB.Sequence(), path, gap,
&arena.pointer.bufferA,
&arena.pointer.bufferB)
seqA = obiseq.MakeBioSequence(seqA.Id(),
A,
seqA.Definition())
seqB = obiseq.MakeBioSequence(seqB.Id(),
B,
seqB.Definition())
return seqA, seqB
}
func BuildQualityConsensus(seqA, seqB obiseq.BioSequence, path []int,
arena1, arena2 BuildAlignArena) (obiseq.BioSequence, int) {
if arena1.pointer == nil {
arena1 = MakeBuildAlignArena(seqA.Length(), seqB.Length())
}
if arena2.pointer == nil {
arena2 = MakeBuildAlignArena(seqA.Length(), seqB.Length())
}
sA, sB := __build_alignment__(seqA.Sequence(), seqB.Sequence(), path, ' ',
&arena1.pointer.bufferA,
&arena1.pointer.bufferB)
qsA, qsB := __build_alignment__(seqA.Qualities(), seqB.Qualities(), path, byte(0),
&arena2.pointer.bufferA,
&arena2.pointer.bufferB)
consensus := make([]byte, 0, len(sA))
qualities := make([]byte, 0, len(sA))
var qA, qB byte
var qM, qm byte
var i int
match := 0
for i, qA = range qsA {
qB = qsB[i]
if qA > qB {
consensus = append(consensus, sA[i])
qM = qA
qm = qB
}
if qB > qA {
consensus = append(consensus, sB[i])
qM = qB
qm = qA
}
if qB == qA {
nuc := __four_bits_base_code__[sA[i]&31] | __four_bits_base_code__[sB[i]&31]
consensus = append(consensus, __four_bits_base_decode__[nuc])
}
q := qA + qB
if qA > 0 && qB > 0 {
if sA[i] != sB[i] {
q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/30))*10+0.5)
}
if sA[i] == sB[i] {
match++
}
}
if q > 90 {
q = 90
}
qualities = append(qualities, q)
}
seq := obiseq.MakeBioSequence(seqA.Id(), consensus, seqA.Definition())
seq.SetQualities(qualities)
return seq, match
}

View File

@@ -0,0 +1,95 @@
package obialign
func __backtracking__(path_matrix []int, lseqA, lseqB int, path *[]int) []int {
needed := (lseqA + lseqB) * 2
if needed > cap(*path) {
*path = make([]int, 0, needed)
}
*path = (*path)[:cap(*path)]
p := cap(*path)
i := lseqA - 1
j := lseqB - 1
ldiag := 0
lup := 0
lleft := 0
for i > -1 || j > -1 {
step := __get_matrix__(&path_matrix, lseqA, i, j)
// log.Printf("I: %d J:%d -> %d\n", i, j, step)
switch {
case step == 0:
if lleft != 0 {
p--
(*path)[p] = ldiag
p--
(*path)[p] = lleft
lleft = 0
ldiag = 0
}
if lup != 0 {
p--
(*path)[p] = ldiag
p--
(*path)[p] = lup
lup = 0
ldiag = 0
}
ldiag++
i--
j--
case step > 0:
if lup != 0 {
p--
(*path)[p] = ldiag
p--
(*path)[p] = lup
lup = 0
ldiag = 0
}
lleft += step
j -= step
case step < 0:
if lleft != 0 {
p--
(*path)[p] = ldiag
p--
(*path)[p] = lleft
lleft = 0
ldiag = 0
}
lup += step
i += step
}
}
if lleft != 0 {
p--
(*path)[p] = ldiag
p--
(*path)[p] = lleft
ldiag = 0
}
if lup != 0 {
p--
(*path)[p] = ldiag
p--
(*path)[p] = lup
ldiag = 0
}
if ldiag != 0 {
p--
(*path)[p] = ldiag
p--
(*path)[p] = 0
}
*path = (*path)[p:cap((*path))]
return *path
}

100
pkg/obialign/dnamatrix.go Normal file
View File

@@ -0,0 +1,100 @@
package obialign
import (
"math"
)
var __four_bits_count__ = []float64{
0, // 0000
1, // 0001
1, // 0010
2, // 0011
1, // 0100
2, // 0101
2, // 0110
3, // 0111
1, // 1000
2, // 1001
2, // 1010
3, // 1011
2, // 1100
3, // 1101
3, // 1110
4, // 1111
}
var __initialized_dna_score__ = false
var __nuc_part_match__ [32][32]float64
var __nuc_score_part_match_match__ [100][100]int
var __nuc_score_part_match_mismatch__ [100][100]int
func __match_ratio__(a, b byte) float64 {
// count of common bits
cm := __four_bits_count__[a&b&15]
ca := __four_bits_count__[a&15]
cb := __four_bits_count__[b&15]
if cm == 0 || ca == 0 || cb == 0 {
return float64(0)
}
return float64(cm) / float64(ca) / float64(cb)
}
func __logaddexp__(a, b float64) float64 {
if a > b {
a, b = b, a
}
return b + math.Log1p(math.Exp(a-b))
}
func __match_score_ratio__(a, b byte) (float64, float64) {
l2 := math.Log(2)
l3 := math.Log(3)
l4 := math.Log(4)
l10 := math.Log(10)
lE1 := -float64(a)/10*l10 - l4
lE2 := -float64(b)/10*l10 - l4
lO1 := math.Log1p(-math.Exp(lE1 + l3))
lO2 := math.Log1p(-math.Exp(lE2 + l3))
lO1O2 := lO1 + lO2
lE1E2 := lE1 + lE2
lO1E2 := lO1 + lE2
lO2E1 := lO2 + lE1
MM := __logaddexp__(lO1O2, lE1E2+l3) + l4
Mm := __logaddexp__(__logaddexp__(lO1E2, lO2E1), lE1E2+l2) + l4
return MM, Mm
}
func __init_nuc_part_match__() {
for i, a := range __four_bits_base_code__ {
for j, b := range __four_bits_base_code__ {
__nuc_part_match__[i][j] = __match_ratio__(a, b)
}
}
}
func __init_nuc_score_part_match__() {
for i := 0; i < 100; i++ {
for j := 0; j < 100; j++ {
MM, Mm := __match_score_ratio__(byte(i), byte(j))
__nuc_score_part_match_match__[i][j] = int(MM*10 + 0.5)
__nuc_score_part_match_mismatch__[i][j] = int(Mm*10 + 0.5)
}
}
}
func InitDNAScoreMatrix() {
if !__initialized_dna_score__ {
__init_nuc_part_match__()
__init_nuc_score_part_match__()
__initialized_dna_score__ = true
}
}

View File

@@ -0,0 +1,74 @@
package obialign
import (
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
var __four_bits_base_code__ = []byte{0b0000,
// IUPAC nucleotide code Base
0b0001, // A Adenine
0b1110, // B C or G or T
0b0010, // C Cytosine
0b1101, // D A or G or T
0b0000, // E not a nucleotide
0b0000, // F not a nucleotide
0b0100, // G Guanine
0b1011, // H A or C or T
0b0000, // I not a nucleotide
0b0000, // J not a nucleotide
0b1100, // K G or T
0b0000, // L not a nucleotide
0b0011, // M A or C
0b1111, // N any base
0b0000, // O not a nucleotide
0b0000, // P not a nucleotide
0b0000, // Q not a nucleotide
0b0101, // R A or G
0b0110, // S G or C
0b1000, // T Thymine
0b1000, // U Uracil
0b0111, // V A or C or G
0b1001, // W A or T
0b0000, // X not a nucleotide
0b1010, // Y C or T
0b0000, // Z not a nucleotide
0b0000,
0b0000,
0b0000,
0b0000,
0b0000}
var __four_bits_base_decode__ = []byte{
// 0b0000 0b0001 0b0010 0b0011
'.', 'a', 'c', 'm',
// 0b0100 0b0101 0b0110 0b0111
'g', 'r', 's', 'v',
// 0b1000 0b1001 0b1010 0b1011
't', 'w', 'y', 'h',
// 0b1100 0b1101 0b1110 0b1111
'k', 'd', 'b', 'n',
}
func Encode4bits(seq obiseq.BioSequence, buffer []byte) []byte {
length := seq.Length()
rawseq := seq.Sequence()
if buffer == nil {
buffer = make([]byte, 0, length)
} else {
buffer = buffer[:0]
}
var code byte
for _, nuc := range rawseq {
if nuc == '.' || nuc == '-' {
code = 0
} else {
code = __four_bits_base_code__[nuc&31]
}
buffer = append(buffer, code)
}
return buffer
}

View File

@@ -0,0 +1,366 @@
package obialign
import (
"log"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obikmer"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
type __pe_align_arena__ struct {
score_matrix []int
path_matrix []int
path []int
fast_index [][]int
fast_buffer []byte
}
type PEAlignArena struct {
pointer *__pe_align_arena__
}
var NilPEAlignArena = PEAlignArena{nil}
func MakePEAlignArena(lseqA, lseqB int) PEAlignArena {
a := __pe_align_arena__{
score_matrix: make([]int, 0, (lseqA+1)*(lseqB+1)),
path_matrix: make([]int, 0, (lseqA+1)*(lseqB+1)),
path: make([]int, 2*(lseqA+lseqB)),
fast_index: make([][]int, 256),
fast_buffer: make([]byte, 0, lseqA),
}
return PEAlignArena{&a}
}
func __set_matrices__(matrixA, matrixB *[]int, lenA, a, b, valueA, valueB int) {
i := (b+1)*(lenA+1) + a + 1
(*matrixA)[i] = valueA
(*matrixB)[i] = valueB
}
func __get_matrix__(matrix *[]int, lenA, a, b int) int {
return (*matrix)[(b+1)*(lenA+1)+a+1]
}
func __get_matrix_from__(matrix *[]int, lenA, a, b int) (int, int, int) {
i := (b+1)*(lenA+1) + a
j := i - lenA
m := *matrix
return m[j], m[j-1], m[i]
}
func __pairing_score_pe_align__(baseA, qualA, baseB, qualB byte) int {
part_match := __nuc_part_match__[baseA&31][baseB&31]
// log.Printf("id : %f A : %s %d B : %s %d\n", part_match, string(baseA), qualA, string(baseB), qualB)
switch {
case part_match == 1:
// log.Printf("match\n")
return __nuc_score_part_match_match__[qualA][qualB]
case part_match == 0:
return __nuc_score_part_match_mismatch__[qualA][qualB]
default:
return int(part_match*float64(__nuc_score_part_match_match__[qualA][qualB]) +
(1-part_match)*float64(__nuc_score_part_match_mismatch__[qualA][qualB]) + 0.5)
}
}
func __fill_matrix_pe_left_align__(seqA, qualA, seqB, qualB []byte, gap int,
score_matrix, path_matrix *[]int) int {
la := len(seqA)
lb := len(seqB)
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gap = gap * __nuc_score_part_match_mismatch__[40][40]
needed := (la + 1) * (lb + 1)
if needed > cap(*score_matrix) {
*score_matrix = make([]int, needed)
}
if needed > cap(*path_matrix) {
*path_matrix = make([]int, needed)
}
*score_matrix = (*score_matrix)[:needed]
*path_matrix = (*path_matrix)[:needed]
__set_matrices__(score_matrix, path_matrix, la, -1, -1, 0, 0)
// Fills the first column with score 0
for i := 0; i < la; i++ {
__set_matrices__(score_matrix, path_matrix, la, i, -1, 0, -1)
}
la1 := la - 1
for j := 0; j < lb; j++ {
__set_matrices__(score_matrix, path_matrix, la, -1, j, (j+1)*gap, 1)
for i := 0; i < la1; i++ {
left, diag, top := __get_matrix_from__(score_matrix, la, i, j)
diag += __pairing_score_pe_align__(seqA[i], qualA[i], seqB[j], qualB[j])
left += gap
top += gap
switch {
case diag > left && diag > top:
__set_matrices__(score_matrix, path_matrix, la, i, j, diag, 0)
case left > diag && left > top:
__set_matrices__(score_matrix, path_matrix, la, i, j, left, +1)
default:
__set_matrices__(score_matrix, path_matrix, la, i, j, top, -1)
}
}
// Special case for the last line Left gap are free
left, diag, top := __get_matrix_from__(score_matrix, la, la1, j)
diag += __pairing_score_pe_align__(seqA[la1], qualA[la1], seqB[j], qualB[j])
top += gap
switch {
case diag > left && diag > top:
__set_matrices__(score_matrix, path_matrix, la, la1, j, diag, 0)
case left > diag && left > top:
__set_matrices__(score_matrix, path_matrix, la, la1, j, left, +1)
default:
__set_matrices__(score_matrix, path_matrix, la, la1, j, top, -1)
}
}
return __get_matrix__(score_matrix, la, la1, lb-1)
}
func __fill_matrix_pe_right_align__(seqA, qualA, seqB, qualB []byte, gap int,
score_matrix, path_matrix *[]int) int {
la := len(seqA)
lb := len(seqB)
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gap = gap * __nuc_score_part_match_mismatch__[40][40]
needed := (la + 1) * (lb + 1)
if needed > cap(*score_matrix) {
*score_matrix = make([]int, needed)
}
if needed > cap(*path_matrix) {
*path_matrix = make([]int, needed)
}
*score_matrix = (*score_matrix)[:needed]
*path_matrix = (*path_matrix)[:needed]
__set_matrices__(score_matrix, path_matrix, la, -1, -1, 0, 0)
// Fills the first column with score 0
for i := 0; i < la; i++ {
__set_matrices__(score_matrix, path_matrix, la, i, -1, (i+1)*gap, -1)
}
lb1 := lb - 1
for j := 0; j < lb1; j++ {
__set_matrices__(score_matrix, path_matrix, la, -1, j, 0, 1)
for i := 0; i < la; i++ {
left, diag, top := __get_matrix_from__(score_matrix, la, i, j)
diag += __pairing_score_pe_align__(seqA[i], qualA[i], seqB[j], qualB[j])
left += gap
top += gap
switch {
case diag > left && left > top:
__set_matrices__(score_matrix, path_matrix, la, i, j, diag, 0)
case left > diag && left > top:
__set_matrices__(score_matrix, path_matrix, la, i, j, left, +1)
default:
__set_matrices__(score_matrix, path_matrix, la, i, j, top, -1)
}
}
}
// Special case for the last colump Up gap are free
__set_matrices__(score_matrix, path_matrix, la, -1, lb1, 0, 1)
for i := 0; i < la; i++ {
left, diag, top := __get_matrix_from__(score_matrix, la, i, lb1)
diag += __pairing_score_pe_align__(seqA[i], qualA[i], seqB[lb1], qualB[lb1])
left += gap
switch {
case diag > left && diag > top:
__set_matrices__(score_matrix, path_matrix, la, i, lb1, diag, 0)
case left > diag && left > top:
__set_matrices__(score_matrix, path_matrix, la, i, lb1, left, +1)
default:
__set_matrices__(score_matrix, path_matrix, la, i, lb1, top, -1)
}
}
return __get_matrix__(score_matrix, la, la-1, lb1)
}
func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {
if !__initialized_dna_score__ {
log.Println("Initializing the DNA Scoring matrix")
InitDNAScoreMatrix()
}
if arena.pointer == nil {
arena = MakePEAlignArena(seqA.Length(), seqB.Length())
}
score := __fill_matrix_pe_left_align__(seqA.Sequence(), seqA.Qualities(),
seqB.Sequence(), seqB.Qualities(), gap,
&arena.pointer.score_matrix,
&arena.pointer.path_matrix)
arena.pointer.path = __backtracking__(arena.pointer.path_matrix,
seqA.Length(), seqB.Length(),
&arena.pointer.path)
return score, arena.pointer.path
}
func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {
if !__initialized_dna_score__ {
log.Println("Initializing the DNA Scoring matrix")
InitDNAScoreMatrix()
}
if arena.pointer == nil {
arena = MakePEAlignArena(seqA.Length(), seqB.Length())
}
score := __fill_matrix_pe_right_align__(seqA.Sequence(), seqA.Qualities(),
seqB.Sequence(), seqB.Qualities(), gap,
&arena.pointer.score_matrix,
&arena.pointer.path_matrix)
arena.pointer.path = __backtracking__(arena.pointer.path_matrix,
seqA.Length(), seqB.Length(),
&arena.pointer.path)
return score, arena.pointer.path
}
func PEAlign(seqA, seqB obiseq.BioSequence,
gap, delta int,
arena PEAlignArena) (int, []int) {
var score, shift int
var startA, startB int
var part_len, over int
var raw_seqA, qual_seqA []byte
var raw_seqB, qual_seqB []byte
var extra5, extra3 int
if !__initialized_dna_score__ {
log.Println("Initializing the DNA Scoring matrix")
InitDNAScoreMatrix()
}
index := obikmer.Index4mer(seqA,
&arena.pointer.fast_index,
&arena.pointer.fast_buffer)
shift, fast_score := obikmer.FastShiftFourMer(index, seqB, nil)
if shift > 0 {
over = seqA.Length() - shift
} else {
over = seqB.Length() + shift
}
if fast_score+3 < over {
if shift > 0 {
startA = shift - delta
if startA < 0 {
startA = 0
}
extra5 = -startA
startB = 0
raw_seqA = seqA.Sequence()[startA:]
qual_seqA = seqA.Qualities()[startA:]
part_len = len(raw_seqA)
raw_seqB = seqB.Sequence()[0:part_len]
qual_seqB = seqB.Qualities()[0:part_len]
extra3 = seqB.Length() - part_len
score = __fill_matrix_pe_left_align__(
raw_seqA, qual_seqA, raw_seqB, qual_seqB, gap,
&arena.pointer.score_matrix,
&arena.pointer.path_matrix)
} else {
startA = 0
startB = -shift - delta
if startB < 0 {
startB = 0
}
extra5 = startB
raw_seqB = seqB.Sequence()[startB:]
qual_seqB = seqB.Qualities()[startB:]
part_len = len(raw_seqB)
raw_seqA = seqA.Sequence()[:part_len]
qual_seqA = seqA.Qualities()[:part_len]
extra3 = part_len - seqA.Length()
score = __fill_matrix_pe_right_align__(
raw_seqA, qual_seqA, raw_seqB, qual_seqB, gap,
&arena.pointer.score_matrix,
&arena.pointer.path_matrix)
}
arena.pointer.path = __backtracking__(arena.pointer.path_matrix,
len(raw_seqA), len(raw_seqB),
&arena.pointer.path)
} else {
if shift > 0 {
startA = shift
startB = 0
extra5 = -startA
qual_seqA = seqA.Qualities()[startA:]
part_len = len(qual_seqA)
qual_seqB = seqB.Qualities()[0:part_len]
extra3 = seqB.Length() - part_len
score = 0
} else {
startA = 0
startB = -shift
extra5 = startB
qual_seqB = seqB.Qualities()[startB:]
part_len = len(qual_seqB)
extra3 = part_len - seqA.Length()
qual_seqA = seqA.Qualities()[:part_len]
}
score = 0
for i, qualA := range qual_seqA {
qualB := qual_seqB[i]
score += __nuc_score_part_match_match__[qualA][qualB]
}
arena.pointer.path = arena.pointer.path[:0]
arena.pointer.path = append(arena.pointer.path, 0, part_len)
}
arena.pointer.path[0] += extra5
if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
arena.pointer.path[len(arena.pointer.path)-2] += extra3
} else {
arena.pointer.path = append(arena.pointer.path, extra3, 0)
}
return score, arena.pointer.path
}