Files
obitools4/pkg/obikmer/debruijn.go
Eric Coissac a33e471b39 First attempt for obiconsensus... The graph traversing algorithm is too simple
Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba
2023-03-27 19:51:10 +07:00

377 lines
6.1 KiB
Go

package obikmer
import (
"bytes"
"fmt"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
// Fixed-width unsigned types named after k-mer indices.
//
// NOTE(review): no function visible in this part of the file uses these types
// directly; they only appear in the KmerIdx_t constraint. Confirm their
// intended role against the rest of the package.
type (
	// KmerIdx32 is an index stored in an unsigned 32-bit integer.
	KmerIdx32 uint32

	// KmerIdx64 is an index stored in an unsigned 64-bit integer.
	KmerIdx64 uint64

	// KmerIdx128 is an index stored in two unsigned 64-bit words.
	// NOTE(review): Lo and Hi are presumably the low and high halves of a
	// 128-bit value; confirm with the code that fills them.
	KmerIdx128 struct {
		Lo uint64
		Hi uint64
	}
)
// Nucleotide translation tables shared by the graph code.
var (
	// iupac maps a lower-case IUPAC nucleotide symbol to the list of 2-bit
	// base codes it can stand for (a=0, c=1, g=2, t=3). Unambiguous symbols
	// map to a single code, ambiguity symbols to two, three or four codes.
	// Upper-case symbols and any other byte are not present in the table.
	iupac = map[byte][]uint64{
		// Unambiguous bases ('u' is encoded like 't').
		'a': {0},
		'c': {1},
		'g': {2},
		't': {3},
		'u': {3},
		// Two-base ambiguity symbols.
		'r': {0, 2},
		'y': {1, 3},
		's': {1, 2},
		'w': {0, 3},
		'k': {2, 3},
		'm': {0, 1},
		// Three-base ambiguity symbols.
		'b': {1, 2, 3},
		'd': {0, 2, 3},
		'h': {0, 1, 3},
		'v': {0, 1, 2},
		// Any base.
		'n': {0, 1, 2, 3},
	}

	// decode is the inverse mapping for the four unambiguous codes: it turns
	// a 2-bit base code back into its lower-case nucleotide letter.
	decode = map[uint64]byte{
		0: 'a',
		1: 'c',
		2: 'g',
		3: 't',
	}
)
// KmerIdx_t is a type constraint whose type set is exactly the three index
// types KmerIdx32, KmerIdx64 and KmerIdx128. Because it is a union
// constraint it can only be used to constrain type parameters.
type KmerIdx_t interface {
KmerIdx32 | KmerIdx64 | KmerIdx128
}
// DeBruijnGraph is a weighted De Bruijn graph whose entries are (k+1)-mers
// packed into a uint64 at two bits per base (a=0, c=1, g=2, t=3), the last
// base occupying the two lowest bits. The first k bases of a key are the node
// it leaves from; the last base is the extension leading to the next node.
type DeBruijnGraph struct {
	// kmersize is k, the number of bases of a node.
	kmersize int

	// kmermask keeps the low 2*(kmersize+1) bits of a key, i.e. one full
	// (k+1)-mer.
	kmermask uint64

	// prevc, prevg and prevt are the base codes 1 (c), 2 (g) and 3 (t)
	// shifted to the leading base position of a key. They are OR-ed into a
	// right-shifted key to enumerate its possible predecessors (code 0, a,
	// needs no mask).
	prevc, prevg, prevt uint64

	// graph maps each stored key to the number of times it was recorded.
	graph map[uint64]uint
}
// MakeDeBruijnGraph returns a pointer to a new, empty graph for nodes of
// kmersize bases.
//
// The masks are derived from kmersize: kmermask covers the 2*(kmersize+1) low
// bits of a key, and prevc/prevg/prevt are the codes 1, 2 and 3 placed at bit
// offset 2*kmersize. Since a key is a uint64, a (k+1)-mer only fits for
// kmersize <= 31; larger values are not rejected here.
func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph {
	// Bit offset of the leading base of a (k+1)-mer.
	lead := uint64(kmersize) * 2

	return &DeBruijnGraph{
		kmersize: kmersize,
		kmermask: ^(^uint64(0) << (lead + 2)),
		prevc:    uint64(1) << lead,
		prevg:    uint64(2) << lead,
		prevt:    uint64(3) << lead,
		graph:    make(map[uint64]uint),
	}
}
// KmerSize reports the k-mer size the graph was created with.
func (g *DeBruijnGraph) KmerSize() int {
	return g.kmersize
}
// Len reports how many distinct keys are currently stored in the graph.
func (g *DeBruijnGraph) Len() int {
	return len(g.graph)
}
// MaxLink returns the largest count stored in the graph, or 0 when the graph
// holds no entry.
func (g *DeBruijnGraph) MaxLink() int {
	var highest uint
	for _, weight := range g.graph {
		if highest < weight {
			highest = weight
		}
	}
	return int(highest)
}
// LinkSpectrum returns the histogram of the counts stored in the graph:
// element i of the result is the number of keys whose count equals i. The
// slice has MaxLink()+1 elements, so an empty graph yields a single zero.
func (g *DeBruijnGraph) LinkSpectrum() []int {
	histogram := make([]int, g.MaxLink()+1)
	for _, weight := range g.graph {
		histogram[int(weight)]++
	}
	return histogram
}
// FilterMin removes from the graph every key whose count is strictly lower
// than min. Keys with a count equal to min are kept.
//
// A zero or negative min keeps everything. Without the explicit guard a
// negative value would wrap to a huge unsigned threshold when converted to
// uint and every entry of the graph would be deleted.
func (g *DeBruijnGraph) FilterMin(min int) {
	if min <= 0 {
		return
	}

	threshold := uint(min)
	// Deleting entries while ranging over a map is allowed in Go.
	for idx, count := range g.graph {
		if count < threshold {
			delete(g.graph, idx)
		}
	}
}
// Previouses returns the keys stored in the graph that can precede index,
// i.e. the (k+1)-mers whose last k bases are the first k bases of index.
//
// The four candidates are obtained by dropping the last base of index and
// putting a, c, g or t in the leading position; those present in the graph
// are returned in that order. The result is empty (not nil) when index has no
// predecessor.
func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
	// First k bases of index, moved to the trailing positions.
	stem := index >> 2

	found := make([]uint64, 0, 4)
	for _, lead := range [4]uint64{0, g.prevc, g.prevg, g.prevt} {
		candidate := stem | lead
		if _, present := g.graph[candidate]; present {
			found = append(found, candidate)
		}
	}
	return found
}
// Nexts returns the keys stored in the graph that can follow index, i.e. the
// (k+1)-mers whose first k bases are the last k bases of index.
//
// The four candidates are obtained by shifting index one base to the left,
// discarding the base pushed out of the mask, and appending a, c, g or t;
// those present in the graph are returned in that order. The result is empty
// (not nil) when index has no successor.
func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
	// Last k bases of index followed by a free trailing slot (two zero bits).
	stem := (index << 2) & g.kmermask

	found := make([]uint64, 0, 4)
	for base := uint64(0); base < 4; base++ {
		candidate := stem | base
		if _, present := g.graph[candidate]; present {
			found = append(found, candidate)
		}
	}
	return found
}
// MaxNext returns the successor of index that has the highest count in the
// graph. The boolean is false, and the key 0, when index has no successor.
//
// When several successors share the highest count, the first one in the
// order produced by Nexts (a, c, g, t) is returned.
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
	var (
		best       uint64
		bestWeight uint
		found      bool
	)

	for _, next := range g.Nexts(index) {
		// The running maximum must be updated together with the candidate;
		// otherwise every successor beats the initial 0 and the last one
		// examined wins regardless of its count.
		if weight := g.graph[next]; !found || weight > bestWeight {
			best, bestWeight, found = next, weight, true
		}
	}

	return best, found
}
// MaxPath walks the graph greedily: it starts from the key returned by
// MaxHead and repeatedly moves to the successor returned by MaxNext. The keys
// are returned in the order they were visited; the result is empty when
// MaxHead finds no starting key.
//
// The walk stops when no successor exists or when the next key has already
// been visited. Without the second condition a cycle in the graph would make
// the walk, and the returned slice, grow forever.
func (g *DeBruijnGraph) MaxPath() []uint64 {
	path := make([]uint64, 0, 1000)
	visited := make(map[uint64]struct{})

	for idx, ok := g.MaxHead(); ok; idx, ok = g.MaxNext(idx) {
		if _, seen := visited[idx]; seen {
			break
		}
		visited[idx] = struct{}{}
		path = append(path, idx)
	}

	return path
}
// LongestPath performs one greedy walk (following MaxNext) from every key
// returned by Heads and returns the walk whose summed counts are the highest.
// Despite its name, the selection criterion is the total weight of the walk,
// not its number of keys.
//
// The result is nil when the graph has no head. When several walks share the
// highest total, the one met first is kept; Heads enumerates a map, so that
// order is not specified.
//
// Each walk stops when no successor exists or when it reaches a key it has
// already visited, so a cycle cannot make it run forever.
func (g *DeBruijnGraph) LongestPath() []uint64 {
	var best []uint64
	bestWeight := uint(0)

	for _, head := range g.Heads() {
		var walk []uint64
		weight := uint(0)
		visited := make(map[uint64]struct{})

		// The continuation flag is scoped to this walk. Sharing one flag
		// across heads would leave it false after the first walk and every
		// later head would be skipped.
		for idx, ok := head, true; ok; idx, ok = g.MaxNext(idx) {
			if _, seen := visited[idx]; seen {
				break
			}
			visited[idx] = struct{}{}
			weight += g.graph[idx]
			walk = append(walk, idx)
		}

		if weight > bestWeight {
			best, bestWeight = walk, weight
		}
	}

	return best
}
// LongestConsensus decodes the path returned by LongestPath and wraps the
// resulting nucleotide string in a new sequence identified by id. The third
// argument given to obiseq.MakeBioSequence is the empty string.
//
// An error is returned, with a nil sequence, when the decoded string is
// empty.
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) {
	consensus := g.DecodePath(g.LongestPath())
	if len(consensus) == 0 {
		return nil, fmt.Errorf("cannot identify optimum path")
	}

	seq := obiseq.MakeBioSequence(id, []byte(consensus), "")
	return &seq, nil
}
// Heads returns every key of the graph that has no predecessor according to
// Previouses. The keys come out in map iteration order, which is not
// specified; the result is empty (not nil) when there is no such key.
func (g *DeBruijnGraph) Heads() []uint64 {
	heads := make([]uint64, 0, 10)
	for key := range g.graph {
		if len(g.Previouses(key)) > 0 {
			continue
		}
		heads = append(heads, key)
	}
	return heads
}
// MaxHead returns the key with the highest count among the keys that have no
// predecessor according to Previouses. The boolean is false, and the key 0,
// when the graph contains no such key.
//
// When several heads share the highest count the smallest key is returned,
// so the result does not depend on map iteration order.
func (g *DeBruijnGraph) MaxHead() (uint64, bool) {
	var (
		best       uint64
		bestWeight uint
		found      bool
	)

	for key, weight := range g.graph {
		if len(g.Previouses(key)) != 0 {
			continue
		}
		// The running maximum must be updated together with the candidate;
		// otherwise every head beats the initial 0 and whichever head the
		// map yields last is returned.
		if !found || weight > bestWeight || (weight == bestWeight && key < best) {
			best, bestWeight, found = key, weight, true
		}
	}

	return best, found
}
// DecodeNode returns, as lower-case nucleotide letters, the first kmersize
// bases of the key index. The last base of the key (its two lowest bits) is
// dropped before decoding.
func (g *DeBruijnGraph) DecodeNode(index uint64) string {
	bases := make([]byte, g.kmersize)

	// Discard the trailing base, then read the remaining bases from the
	// lowest bits upward while filling the buffer from its end.
	code := index >> 2
	for pos := len(bases); pos > 0; pos-- {
		bases[pos-1] = decode[code&3]
		code >>= 2
	}

	return string(bases)
}
// DecodePath converts a path of consecutive keys into the nucleotide
// sequence it spells, or "" for an empty path.
//
// Every key is a (k+1)-mer. The sequence is the first k bases of the first
// key (as returned by DecodeNode) followed by the last base of every key of
// the path, the first key included. DecodeNode drops that last base, so
// skipping the first key here would lose the base at position k and return a
// sequence one base too short.
func (g *DeBruijnGraph) DecodePath(path []uint64) string {
	if len(path) == 0 {
		return ""
	}

	var buf bytes.Buffer
	buf.Grow(g.kmersize + len(path))

	buf.WriteString(g.DecodeNode(path[0]))
	for _, idx := range path {
		buf.WriteByte(decode[idx&3])
	}

	return buf.String()
}
// BestConsensus decodes the path returned by MaxPath and wraps the resulting
// nucleotide string in a new sequence identified by id. The third argument
// given to obiseq.MakeBioSequence is the empty string.
//
// An error is returned, with a nil sequence, when the decoded string is
// empty.
func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence, error) {
	consensus := g.DecodePath(g.MaxPath())
	if len(consensus) == 0 {
		return nil, fmt.Errorf("cannot identify optimum path")
	}

	seq := obiseq.MakeBioSequence(id, []byte(consensus), "")
	return &seq, nil
}
// Weight returns the count stored for the key index, or 0 when the key is
// not in the graph.
func (g *DeBruijnGraph) Weight(index uint64) int {
	// A missing map key yields the zero value, which is the answer wanted.
	return int(g.graph[index])
}
// append extends the key current with the bases of sequence, one at a time,
// and increments the count of every key produced along the way.
//
// For each base the key is shifted one position to the left, truncated with
// kmermask, and completed with the 2-bit code of the base. A symbol that maps
// to a single code continues the loop. For any other symbol, each of its
// codes is tried in turn: the resulting key is counted and the rest of the
// sequence is processed recursively from it, after which the method returns.
// A byte absent from the iupac table has no code, so processing simply stops
// there and the remainder of the sequence is ignored.
func (g *DeBruijnGraph) append(sequence []byte, current uint64) {
	for pos, symbol := range sequence {
		current = (current << 2) & g.kmermask
		codes := iupac[symbol]

		if len(codes) != 1 {
			// Ambiguous (or unknown) symbol: branch once per possible base.
			for _, code := range codes {
				current = current&^3 | code
				g.graph[current]++
				g.append(sequence[pos+1:], current)
			}
			return
		}

		current |= codes[0]
		g.graph[current]++
	}
}
// Push records every (k+1)-mer of sequence in the graph.
//
// The first kmersize bases are packed into one or more starting keys: an
// ambiguous IUPAC symbol multiplies the starting keys by the number of bases
// it can stand for. The remainder of the sequence is then fed to append from
// each starting key, which increments the count of every key it produces.
//
// A sequence with kmersize bases or fewer contains no (k+1)-mer and leaves
// the graph unchanged; without this check a sequence shorter than kmersize
// would be indexed out of range. A sequence whose first kmersize bytes
// include one that is absent from the iupac table is ignored as well.
func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
	s := sequence.Sequence()
	k := graph.kmersize
	if len(s) <= k {
		return
	}

	// Start from the single empty key and extend it one base at a time.
	seeds := make([]uint64, 1, 16)
	for _, symbol := range s[:k] {
		codes := iupac[symbol]
		switch len(codes) {
		case 0:
			// Unknown symbol: no starting key can be built.
			return
		case 1:
			for i := range seeds {
				seeds[i] = seeds[i]<<2 | codes[0]
			}
		default:
			// Ambiguous symbol: every key forks once per possible base.
			expanded := make([]uint64, 0, len(seeds)*len(codes))
			for _, seed := range seeds {
				for _, code := range codes {
					expanded = append(expanded, seed<<2|code)
				}
			}
			seeds = expanded
		}
	}

	for _, seed := range seeds {
		graph.append(s[k:], seed&graph.kmermask)
	}
}