mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Add a first version of obitag the successor of ecotag
This commit is contained in:
76
pkg/obikmer/counting.go
Normal file
76
pkg/obikmer/counting.go
Normal file
@ -0,0 +1,76 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
// Table4mer is a fixed-size table of 256 uint16 counters. The functions of
// this file index it with the one-byte code of a 4-mer, so each cell holds
// the count associated with one 4-mer code.
type Table4mer [256]uint16
|
||||
|
||||
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
|
||||
|
||||
iternal_buffer := Encode4mer(seq, buffer)
|
||||
|
||||
if counts == nil {
|
||||
var w Table4mer
|
||||
counts = &w
|
||||
}
|
||||
|
||||
// Every cells of the counter is set to zero
|
||||
for i := 0; i < 256; i++ {
|
||||
(*counts)[i] = 0
|
||||
}
|
||||
|
||||
for _, code := range iternal_buffer {
|
||||
(*counts)[code]++
|
||||
}
|
||||
return counts
|
||||
}
|
||||
|
||||
func Common4Mer(count1, count2 *Table4mer) int {
|
||||
sum := 0
|
||||
for i := 0; i < 256; i++ {
|
||||
sum += int(goutils.MinUInt16((*count1)[i], (*count2)[i]))
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
func Sum4Mer(count *Table4mer) int {
|
||||
sum := 0
|
||||
for i := 0; i < 256; i++ {
|
||||
sum += int((*count)[i])
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
|
||||
s1 := Sum4Mer(count1)
|
||||
s2 := Sum4Mer(count2)
|
||||
smin := goutils.MinInt(s1, s2)
|
||||
|
||||
cw := Common4Mer(count1, count2)
|
||||
|
||||
lcsMax := smin + 3 - int(math.Ceil(float64(smin-cw)/4.0))
|
||||
lcsMin := cw
|
||||
|
||||
if cw > 0 {
|
||||
lcsMin += 3
|
||||
}
|
||||
|
||||
return lcsMin, lcsMax
|
||||
}
|
||||
|
||||
func Error4MerBounds(count1, count2 *Table4mer) (int, int) {
|
||||
s1 := Sum4Mer(count1)
|
||||
s2 := Sum4Mer(count2)
|
||||
smax := goutils.MaxInt(s1, s2)
|
||||
|
||||
cw := Common4Mer(count1, count2)
|
||||
|
||||
errorMax := smax - cw + 2* int(math.Floor(float64(cw+5)/8.0))
|
||||
errorMin := int(math.Ceil(float64(errorMax) / 4.0))
|
||||
|
||||
return errorMin, errorMax
|
||||
}
|
@ -21,7 +21,7 @@ var __single_base_code__ = []byte{0,
|
||||
}
|
||||
|
||||
// Encode4mer transforms an obiseq.BioSequence into a sequence
|
||||
// of kmer of length 4. Each letter of the sequence noot belonging
|
||||
// of kmer of length 4. Each letter of the sequence not belonging
|
||||
// A, C, G, T, U are considered as a A. The kmer is encoded as a byte
|
||||
// value ranging from 0 to 255. Each nucleotide is represented by
|
||||
// two bits. The values 0, 1, 2, 3 correspond respectively to A, C, G,
|
||||
@ -65,15 +65,24 @@ func Encode4mer(seq *obiseq.BioSequence, buffer *[]byte) []byte {
|
||||
return *buffer
|
||||
}
|
||||
|
||||
// Index4mer returns an index where the occurrence position of every fourmer is
|
||||
// stored. The index is returned as an array of slices of integer. The first
|
||||
// dimension corresponds to the code of the 4mer, the second to its occurrence positions.
|
||||
func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int {
|
||||
|
||||
iternal_buffer := Encode4mer(seq, buffer)
|
||||
|
||||
if index == nil || cap(*index) < 256 {
|
||||
// A new index is created
|
||||
i := make([][]int, 256)
|
||||
index = &i
|
||||
if index == nil {
|
||||
index = &i
|
||||
} else {
|
||||
*index = i
|
||||
}
|
||||
}
|
||||
|
||||
// Every cell of the index is emptied
|
||||
for i := 0; i < 256; i++ {
|
||||
(*index)[i] = (*index)[i][:0]
|
||||
}
|
||||
@ -85,6 +94,9 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int
|
||||
return *index
|
||||
}
|
||||
|
||||
// FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences.
|
||||
// The returned values are two integer values. The shift between both the sequences and the count of
|
||||
// matching 4mer when this shift is applied between both the sequences.
|
||||
func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (int, int) {
|
||||
|
||||
iternal_buffer := Encode4mer(seq, buffer)
|
||||
@ -115,3 +127,4 @@ func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (i
|
||||
|
||||
return maxshift, maxcount
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user