2022-01-15 19:10:16 +01:00
|
|
|
// obialign : function for aligning two sequences
|
|
|
|
//
|
|
|
|
// The obialign package provides a set of functions
|
|
|
|
// foor aligning two objects of type obiseq.BioSequence.
|
2022-01-13 23:27:39 +01:00
|
|
|
package obialign
|
|
|
|
|
|
|
|
import (
|
2022-10-12 23:01:47 +02:00
|
|
|
"fmt"
|
2022-01-13 23:27:39 +01:00
|
|
|
"math"
|
2022-10-12 23:01:47 +02:00
|
|
|
"strings"
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2023-11-29 12:14:37 +01:00
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
2022-01-13 23:27:39 +01:00
|
|
|
)
|
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
// // A pool of byte slices.
|
|
|
|
// var _BuildAlignArenaPool = sync.Pool{
|
|
|
|
// New: func() interface{} {
|
|
|
|
// bs := make([]byte, 0, 300)
|
|
|
|
// return &bs
|
|
|
|
// },
|
|
|
|
// }
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2023-08-30 19:59:46 +02:00
|
|
|
// _BuildAlignment builds the alignment between two sequences.
|
|
|
|
//
|
2022-07-31 21:52:04 +02:00
|
|
|
// It takes two sequences, a path, a gap character, and two buffers, and it builds the alignment by
|
2023-08-30 19:59:46 +02:00
|
|
|
// walking the path and copying the sequences into the buffers.
|
|
|
|
//
|
|
|
|
// Parameters:
|
|
|
|
// - seqA: a byte slice representing the first sequence.
|
|
|
|
// - seqB: a byte slice representing the second sequence.
|
|
|
|
// - path: a slice of integers representing the alignment path.
|
|
|
|
// - gap: a byte representing the gap character.
|
|
|
|
// - bufferA: a pointer to a byte slice for storing the aligned sequence A.
|
|
|
|
// - bufferB: a pointer to a byte slice for storing the aligned sequence B.
|
2022-01-15 19:10:16 +01:00
|
|
|
func _BuildAlignment(seqA, seqB []byte, path []int, gap byte, bufferA, bufferB *[]byte) {
|
2022-01-13 23:27:39 +01:00
|
|
|
|
|
|
|
*bufferA = (*bufferA)[:0]
|
|
|
|
*bufferB = (*bufferB)[:0]
|
|
|
|
|
|
|
|
lp := len(path)
|
|
|
|
|
2022-01-14 15:15:26 +01:00
|
|
|
posA := 0
|
|
|
|
posB := 0
|
2022-01-13 23:27:39 +01:00
|
|
|
for i := 0; i < lp; i++ {
|
|
|
|
step := path[i]
|
|
|
|
if step < 0 {
|
2022-01-14 15:15:26 +01:00
|
|
|
*bufferA = append(*bufferA, seqA[posA:(posA-step)]...)
|
2022-01-13 23:27:39 +01:00
|
|
|
for j := 0; j < -step; j++ {
|
|
|
|
*bufferB = append(*bufferB, gap)
|
|
|
|
}
|
2022-01-14 15:15:26 +01:00
|
|
|
posA -= step
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
if step > 0 {
|
2022-01-14 15:15:26 +01:00
|
|
|
*bufferB = append(*bufferB, seqB[posB:(posB+step)]...)
|
2022-01-13 23:27:39 +01:00
|
|
|
for j := 0; j < step; j++ {
|
|
|
|
*bufferA = append(*bufferA, gap)
|
|
|
|
}
|
2022-01-14 15:15:26 +01:00
|
|
|
posB += step
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
i++
|
|
|
|
step = path[i]
|
|
|
|
if step > 0 {
|
2022-01-14 15:15:26 +01:00
|
|
|
*bufferA = append(*bufferA, seqA[posA:(posA+step)]...)
|
|
|
|
*bufferB = append(*bufferB, seqB[posB:(posB+step)]...)
|
|
|
|
posA += step
|
|
|
|
posB += step
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2022-01-14 15:15:26 +01:00
|
|
|
// BuildAlignment builds the aligned sequences from an alignemnt path
|
|
|
|
// returned by one of the alignment procedure.
|
|
|
|
// The user has to provide both sequences (seqA and seqB), the alignment
|
|
|
|
// path (path), the symbole used to materialiaze gaps (gap) which is
|
|
|
|
// usually the dash '-', and a BuildAlignArena (arena). It is always possible
|
|
|
|
// to provide the NilBuildAlignArena instance for this last parameter.
|
|
|
|
// In that case an arena will be allocated by the function but, it will not
|
|
|
|
// be reusable for other alignments and desallocated at the BuildAlignment
|
|
|
|
// return.
|
2022-02-21 19:00:23 +01:00
|
|
|
func BuildAlignment(seqA, seqB *obiseq.BioSequence,
|
|
|
|
path []int, gap byte) (*obiseq.BioSequence, *obiseq.BioSequence) {
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-11-17 11:09:58 +01:00
|
|
|
bufferSA := obiseq.GetSlice(seqA.Len())
|
2022-02-21 19:00:23 +01:00
|
|
|
defer obiseq.RecycleSlice(&bufferSA)
|
2022-01-15 19:10:16 +01:00
|
|
|
|
2022-11-17 11:09:58 +01:00
|
|
|
bufferSB := obiseq.GetSlice(seqB.Len())
|
2022-02-21 19:00:23 +01:00
|
|
|
defer obiseq.RecycleSlice(&bufferSB)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-01-15 19:10:16 +01:00
|
|
|
_BuildAlignment(seqA.Sequence(), seqB.Sequence(), path, gap,
|
2022-01-16 00:21:42 +01:00
|
|
|
&bufferSA,
|
|
|
|
&bufferSB)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-02-21 19:00:23 +01:00
|
|
|
seqA = obiseq.NewBioSequence(seqA.Id(),
|
2022-01-16 00:21:42 +01:00
|
|
|
bufferSA,
|
2022-01-13 23:27:39 +01:00
|
|
|
seqA.Definition())
|
|
|
|
|
2022-02-21 19:00:23 +01:00
|
|
|
seqB = obiseq.NewBioSequence(seqB.Id(),
|
2022-01-16 00:21:42 +01:00
|
|
|
bufferSB,
|
2022-01-13 23:27:39 +01:00
|
|
|
seqB.Definition())
|
|
|
|
|
|
|
|
return seqA, seqB
|
|
|
|
}
|
|
|
|
|
2022-01-15 19:10:16 +01:00
|
|
|
// func _logSlice(x *[]byte) {
|
|
|
|
// l := len(*x)
|
|
|
|
// if l > 10 {
|
|
|
|
// l = 10
|
|
|
|
// }
|
|
|
|
// log.Printf("%v (%10s): slice=%p array=%p cap=%d len=%d\n", (*x)[:l], string((*x)[:l]), x, (*x), cap(*x), len(*x))
|
|
|
|
// }
|
|
|
|
|
2022-01-14 15:15:26 +01:00
|
|
|
// BuildQualityConsensus builds the consensus sequences corresponding to an
|
|
|
|
// alignement between two sequences.
|
|
|
|
// The consensus is built from an alignemnt path returned by one of the
|
|
|
|
// alignment procedure and the quality score associated to the sequence.
|
|
|
|
// In case of mismatches the nucleotide with the best score is conserved
|
|
|
|
// in the consensus. In case of score equality, an IUPAC symbol correesponding
|
|
|
|
// to the ambiguity is used.
|
|
|
|
// The user has to provide both sequences (seqA and seqB), the alignment
|
|
|
|
// path (path), and two BuildAlignArena (arena1 and arena2). It is always possible
|
|
|
|
// to provide the NilBuildAlignArena instance for these two last parameters.
|
|
|
|
// In that case arenas will be allocated by the function but, they will not
|
|
|
|
// be reusable for other alignments and desallocated at the BuildQualityConsensus
|
|
|
|
// return.
|
2022-10-12 23:01:47 +02:00
|
|
|
func BuildQualityConsensus(seqA, seqB *obiseq.BioSequence, path []int, statOnMismatch bool) (*obiseq.BioSequence, int) {
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-11-17 11:09:58 +01:00
|
|
|
bufferSA := obiseq.GetSlice(seqA.Len())
|
|
|
|
bufferSB := obiseq.GetSlice(seqB.Len())
|
2022-02-21 19:00:23 +01:00
|
|
|
defer obiseq.RecycleSlice(&bufferSB)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-11-17 11:09:58 +01:00
|
|
|
bufferQA := obiseq.GetSlice(seqA.Len())
|
|
|
|
bufferQB := obiseq.GetSlice(seqB.Len())
|
2022-02-21 19:00:23 +01:00
|
|
|
defer obiseq.RecycleSlice(&bufferQB)
|
2022-01-15 19:10:16 +01:00
|
|
|
|
|
|
|
_BuildAlignment(seqA.Sequence(), seqB.Sequence(), path, ' ',
|
2022-01-16 00:21:42 +01:00
|
|
|
&bufferSA, &bufferSB)
|
2022-01-15 19:10:16 +01:00
|
|
|
|
|
|
|
// log.Printf("#1 %s--> la : %d,%p lb : %d,%p qa : %d,%p qb : %d,%p\n", stamp,
|
|
|
|
// len(*bufferSA), bufferSA, len(*bufferSB), bufferSB,
|
|
|
|
// len(*bufferQA), bufferQA, len(*bufferQB), bufferQB)
|
|
|
|
|
|
|
|
_BuildAlignment(seqA.Qualities(), seqB.Qualities(), path, byte(0),
|
2022-01-16 00:21:42 +01:00
|
|
|
&bufferQA, &bufferQB)
|
2022-01-15 19:10:16 +01:00
|
|
|
|
|
|
|
// log.Printf("#2 %s--> la : %d,%p lb : %d,%p qa : %d,%p qb : %d,%p\n", stamp,
|
|
|
|
// len(*bufferSA), bufferSA, len(*bufferSB), bufferSB,
|
|
|
|
// len(*bufferQA), bufferQA, len(*bufferQB), bufferQB)
|
|
|
|
// log.Printf("#3 %s--> la : %d lb : %d, qa : %d qb : %d\n", stamp, len(sA), len(sB), len(qsA), len(qsB))
|
2022-01-13 23:27:39 +01:00
|
|
|
|
|
|
|
var qA, qB byte
|
|
|
|
var qM, qm byte
|
|
|
|
var i int
|
|
|
|
|
2022-10-12 23:01:47 +02:00
|
|
|
mismatches := make(map[string]int)
|
|
|
|
|
2022-01-13 23:27:39 +01:00
|
|
|
match := 0
|
|
|
|
|
2022-01-16 00:21:42 +01:00
|
|
|
for i, qA = range bufferQA {
|
|
|
|
nA := bufferSA[i]
|
|
|
|
nB := bufferSB[i]
|
|
|
|
qB = bufferQB[i]
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-10-12 23:01:47 +02:00
|
|
|
if statOnMismatch && nA != nB && nA != ' ' && nB != ' ' {
|
|
|
|
mismatches[strings.ToUpper(fmt.Sprintf("(%c:%02d)->(%c:%02d)", nA, qA, nB, qB))] = i + 1
|
|
|
|
}
|
|
|
|
|
2022-01-13 23:27:39 +01:00
|
|
|
if qA > qB {
|
|
|
|
qM = qA
|
|
|
|
qm = qB
|
|
|
|
}
|
|
|
|
if qB > qA {
|
2022-01-16 00:21:42 +01:00
|
|
|
bufferSA[i] = bufferSB[i]
|
2022-01-13 23:27:39 +01:00
|
|
|
qM = qB
|
|
|
|
qm = qA
|
|
|
|
}
|
2022-01-15 19:10:16 +01:00
|
|
|
if qB == qA && nA != nB {
|
|
|
|
nuc := _FourBitsBaseCode[nA&31] | _FourBitsBaseCode[nB&31]
|
2022-01-16 00:21:42 +01:00
|
|
|
bufferSA[i] = _FourBitsBaseDecode[nuc]
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
q := qA + qB
|
|
|
|
|
|
|
|
if qA > 0 && qB > 0 {
|
2022-01-15 19:10:16 +01:00
|
|
|
if nA != nB {
|
2022-01-13 23:27:39 +01:00
|
|
|
q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/30))*10+0.5)
|
|
|
|
}
|
2022-01-15 19:10:16 +01:00
|
|
|
if nA == nB {
|
2022-01-13 23:27:39 +01:00
|
|
|
match++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if q > 90 {
|
|
|
|
q = 90
|
|
|
|
}
|
2022-01-15 19:10:16 +01:00
|
|
|
|
2022-01-16 00:21:42 +01:00
|
|
|
bufferQA[i] = q
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
|
|
|
|
2022-02-21 19:00:23 +01:00
|
|
|
consSeq := obiseq.NewBioSequence(
|
2022-01-15 19:10:16 +01:00
|
|
|
seqA.Id(),
|
2022-01-16 00:21:42 +01:00
|
|
|
bufferSA,
|
2022-01-15 19:10:16 +01:00
|
|
|
seqA.Definition(),
|
|
|
|
)
|
2022-01-16 00:21:42 +01:00
|
|
|
consSeq.SetQualities(bufferQA)
|
2022-01-13 23:27:39 +01:00
|
|
|
|
2022-10-12 23:01:47 +02:00
|
|
|
if statOnMismatch && len(mismatches) > 0 {
|
|
|
|
consSeq.SetAttribute("pairing_mismatches", mismatches)
|
|
|
|
}
|
|
|
|
|
2022-01-15 19:10:16 +01:00
|
|
|
return consSeq, match
|
2022-01-13 23:27:39 +01:00
|
|
|
}
|
2022-10-12 23:01:47 +02:00
|
|
|
|
|
|
|
// func BuildCigar(seqA, seqB *obiseq.BioSequence, path []int) string {
|
|
|
|
|
|
|
|
// lp := len(path)
|
|
|
|
|
|
|
|
// posA := 0
|
|
|
|
// posB := 0
|
|
|
|
// oldStep := ' '
|
|
|
|
// kstep := ' '
|
|
|
|
// for i := 0; i < lp; i++ {
|
|
|
|
// step := path[i]
|
|
|
|
|
|
|
|
// if step < 0 {
|
|
|
|
// kstep='D'
|
|
|
|
// posA -= step
|
|
|
|
// }
|
|
|
|
// if step > 0 {
|
|
|
|
// kstep='I'
|
|
|
|
// posB += step
|
|
|
|
// }
|
|
|
|
|
|
|
|
// i++
|
|
|
|
// step = path[i]
|
|
|
|
// if step > 0 {
|
|
|
|
// kstep = 'M'
|
|
|
|
// posA += step
|
|
|
|
// posB += step
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
// }
|