Add a first version of obitag the successor of ecotag

This commit is contained in:
2022-10-26 13:16:56 +02:00
parent e17d1fbca6
commit 8aa323dad5
17 changed files with 884 additions and 5 deletions

76
pkg/obikmer/counting.go Normal file
View File

@ -0,0 +1,76 @@
package obikmer
import (
"math"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
// Table4mer is a table of 256 uint16 counters, one per 4-mer code
// (a byte value ranging from 0 to 255).
type Table4mer [256]uint16
// Count4Mer counts the occurrences of every 4-mer code of seq.
//
// The sequence is encoded with Encode4mer (buffer is handed over to it as
// is). The occurrences are accumulated into counts, which is reset to zero
// beforehand; when counts is nil a new Table4mer is allocated. The table
// holding the result is returned.
//
// NOTE(review): the counters are uint16, so a code occurring more than 65535
// times in seq wraps around silently.
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
	codes := Encode4mer(seq, buffer)

	if counts == nil {
		counts = new(Table4mer)
	}

	// Start from an all-zero table, whatever the caller handed over.
	*counts = Table4mer{}

	for _, c := range codes {
		counts[c]++
	}

	return counts
}
// Common4Mer returns the number of 4-mer occurrences shared by two tables:
// for each of the 256 codes the smaller of the two counts is taken, and those
// minima are added up.
func Common4Mer(count1, count2 *Table4mer) int {
	shared := 0
	for code := range count1 {
		shared += int(goutils.MinUInt16(count1[code], count2[code]))
	}
	return shared
}
// Sum4Mer returns the total of the 256 counters stored in count.
func Sum4Mer(count *Table4mer) int {
	total := 0
	for _, n := range count {
		total += int(n)
	}
	return total
}
// LCS4MerBounds computes a (lower, upper) pair of bounds from two 4-mer count
// tables.
//
// With smin the smaller of the two table sums and cw the value returned by
// Common4Mer:
//
//	upper = smin + 3 - ceil((smin - cw) / 4)
//	lower = cw + 3 when cw > 0, and 0 otherwise
//
// NOTE(review): presumably these bound the length of the longest common
// subsequence of the two sequences the tables were built from, the "+ 3"
// turning a number of 4-mers into a number of letters — confirm with callers.
func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
	shortest := goutils.MinInt(Sum4Mer(count1), Sum4Mer(count2))
	shared := Common4Mer(count1, count2)

	// 4-mer occurrences of the smaller table left without a counterpart.
	unshared := float64(shortest - shared)
	upper := shortest + 3 - int(math.Ceil(unshared/4.0))

	lower := shared
	if shared > 0 {
		lower = shared + 3
	}

	return lower, upper
}
// Error4MerBounds computes a (lower, upper) pair of bounds from two 4-mer
// count tables.
//
// With smax the larger of the two table sums and cw the value returned by
// Common4Mer:
//
//	upper = smax - cw + 2 * floor((cw + 5) / 8)
//	lower = ceil(upper / 4)
//
// NOTE(review): presumably these bound the number of differences between the
// two sequences the tables were built from — confirm with callers.
func Error4MerBounds(count1, count2 *Table4mer) (int, int) {
	longest := goutils.MaxInt(Sum4Mer(count1), Sum4Mer(count2))
	shared := Common4Mer(count1, count2)

	correction := 2 * int(math.Floor(float64(shared+5)/8.0))
	upper := longest - shared + correction
	lower := int(math.Ceil(float64(upper) / 4.0))

	return lower, upper
}

View File

@ -21,7 +21,7 @@ var __single_base_code__ = []byte{0,
}
// Encode4mer transforms an obiseq.BioSequence into a sequence
// of kmer of length 4. Each letter of the sequence noot belonging
// of kmer of length 4. Each letter of the sequence not belonging
// A, C, G, T, U are considered as a A. The kmer is encoded as a byte
// value ranging from 0 to 255. Each nucleotide is represented by
// two bits. The values 0, 1, 2, 3 correspond respectively to A, C, G,
@ -65,15 +65,24 @@ func Encode4mer(seq *obiseq.BioSequence, buffer *[]byte) []byte {
return *buffer
}
// Index4mer returns an index where the occurrence positions of every 4mer are
// stored. The index is returned as an array of slices of integers. The first
// dimension corresponds to the code of the 4mer, the second to the positions
// at which that 4mer occurs.
func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int {
iternal_buffer := Encode4mer(seq, buffer)
if index == nil || cap(*index) < 256 {
// A new index is created
i := make([][]int, 256)
index = &i
if index == nil {
index = &i
} else {
*index = i
}
}
// Every cells of the index is emptied
for i := 0; i < 256; i++ {
(*index)[i] = (*index)[i][:0]
}
@ -85,6 +94,9 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int
return *index
}
// FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences.
// The returned values are two integer values. The shift between both the sequences and the count of
// matching 4mer when this shift is applied between both the sequences.
func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (int, int) {
iternal_buffer := Encode4mer(seq, buffer)
@ -115,3 +127,4 @@ func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (i
return maxshift, maxcount
}