Files
obitools4/pkg/obialign/alignment.go
2022-01-14 15:36:39 +01:00

198 lines
5.1 KiB
Go

package obialign
import (
"math"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
type _BuildAlignArena struct {
bufferA []byte
bufferB []byte
}
// BuildAlignArena define memory arena usable by the
// BuildAlignment function. The same arena can be reused
// from alignment to alignment to limit memory allocation
// and desallocation process.
type BuildAlignArena struct {
pointer *_BuildAlignArena
}
// NilBuildAlignArena is the nil instance of the BuildAlignArena
// type.
var NilBuildAlignArena = BuildAlignArena{nil}
// MakeBuildAlignArena make a new arena for aligning two sequences
// of maximum length indicated by lseqA and lseqB.
func MakeBuildAlignArena(lseqA, lseqB int) BuildAlignArena {
a := _BuildAlignArena{
bufferA: make([]byte, lseqA+lseqB),
bufferB: make([]byte, lseqA+lseqB),
}
return BuildAlignArena{&a}
}
func _BuildAlignment(seqA, seqB []byte, path []int, gap byte,
bufferA, bufferB *[]byte) ([]byte, []byte) {
if bufferA == nil {
b := make([]byte, 0, len(seqA)+len(seqB))
bufferA = &b
}
if bufferB == nil {
b := make([]byte, 0, len(seqA)+len(seqB))
bufferB = &b
}
*bufferA = (*bufferA)[:0]
*bufferB = (*bufferB)[:0]
lp := len(path)
posA := 0
posB := 0
for i := 0; i < lp; i++ {
step := path[i]
if step < 0 {
*bufferA = append(*bufferA, seqA[posA:(posA-step)]...)
for j := 0; j < -step; j++ {
*bufferB = append(*bufferB, gap)
}
posA -= step
}
if step > 0 {
*bufferB = append(*bufferB, seqB[posB:(posB+step)]...)
for j := 0; j < step; j++ {
*bufferA = append(*bufferA, gap)
}
posB += step
}
i++
step = path[i]
if step > 0 {
*bufferA = append(*bufferA, seqA[posA:(posA+step)]...)
*bufferB = append(*bufferB, seqB[posB:(posB+step)]...)
posA += step
posB += step
}
}
return *bufferA, *bufferB
}
// BuildAlignment builds the aligned sequences from an alignemnt path
// returned by one of the alignment procedure.
// The user has to provide both sequences (seqA and seqB), the alignment
// path (path), the symbole used to materialiaze gaps (gap) which is
// usually the dash '-', and a BuildAlignArena (arena). It is always possible
// to provide the NilBuildAlignArena instance for this last parameter.
// In that case an arena will be allocated by the function but, it will not
// be reusable for other alignments and desallocated at the BuildAlignment
// return.
func BuildAlignment(seqA, seqB obiseq.BioSequence,
path []int, gap byte, arena BuildAlignArena) (obiseq.BioSequence, obiseq.BioSequence) {
if arena.pointer == nil {
arena = MakeBuildAlignArena(seqA.Length(), seqB.Length())
}
A, B := _BuildAlignment(seqA.Sequence(), seqB.Sequence(), path, gap,
&arena.pointer.bufferA,
&arena.pointer.bufferB)
seqA = obiseq.MakeBioSequence(seqA.Id(),
A,
seqA.Definition())
seqB = obiseq.MakeBioSequence(seqB.Id(),
B,
seqB.Definition())
return seqA, seqB
}
// BuildQualityConsensus builds the consensus sequences corresponding to an
// alignement between two sequences.
// The consensus is built from an alignemnt path returned by one of the
// alignment procedure and the quality score associated to the sequence.
// In case of mismatches the nucleotide with the best score is conserved
// in the consensus. In case of score equality, an IUPAC symbol correesponding
// to the ambiguity is used.
// The user has to provide both sequences (seqA and seqB), the alignment
// path (path), and two BuildAlignArena (arena1 and arena2). It is always possible
// to provide the NilBuildAlignArena instance for these two last parameters.
// In that case arenas will be allocated by the function but, they will not
// be reusable for other alignments and desallocated at the BuildQualityConsensus
// return.
func BuildQualityConsensus(seqA, seqB obiseq.BioSequence, path []int,
arena1, arena2 BuildAlignArena) (obiseq.BioSequence, int) {
if arena1.pointer == nil {
arena1 = MakeBuildAlignArena(seqA.Length(), seqB.Length())
}
if arena2.pointer == nil {
arena2 = MakeBuildAlignArena(seqA.Length(), seqB.Length())
}
sA, sB := _BuildAlignment(seqA.Sequence(), seqB.Sequence(), path, ' ',
&arena1.pointer.bufferA,
&arena1.pointer.bufferB)
qsA, qsB := _BuildAlignment(seqA.Qualities(), seqB.Qualities(), path, byte(0),
&arena2.pointer.bufferA,
&arena2.pointer.bufferB)
consensus := make([]byte, 0, len(sA))
qualities := make([]byte, 0, len(sA))
var qA, qB byte
var qM, qm byte
var i int
match := 0
for i, qA = range qsA {
qB = qsB[i]
if qA > qB {
consensus = append(consensus, sA[i])
qM = qA
qm = qB
}
if qB > qA {
consensus = append(consensus, sB[i])
qM = qB
qm = qA
}
if qB == qA {
nuc := _FourBitsBaseCode[sA[i]&31] | _FourBitsBaseCode[sB[i]&31]
consensus = append(consensus, _FourBitsBaseDecode[nuc])
}
q := qA + qB
if qA > 0 && qB > 0 {
if sA[i] != sB[i] {
q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/30))*10+0.5)
}
if sA[i] == sB[i] {
match++
}
}
if q > 90 {
q = 90
}
qualities = append(qualities, q)
}
seq := obiseq.MakeBioSequence(seqA.Id(), consensus, seqA.Definition())
seq.SetQualities(qualities)
return seq, match
}