mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
First attempt for obiconsensus... The graph traversing algorithm is too simple
Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba
This commit is contained in:
376
pkg/obikmer/debruijn.go
Normal file
376
pkg/obikmer/debruijn.go
Normal file
@ -0,0 +1,376 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
type KmerIdx32 uint32
|
||||
type KmerIdx64 uint64
|
||||
type KmerIdx128 struct {
|
||||
Lo uint64
|
||||
Hi uint64
|
||||
}
|
||||
|
||||
var iupac = map[byte][]uint64{
|
||||
'a': {0},
|
||||
'c': {1},
|
||||
'g': {2},
|
||||
't': {3},
|
||||
'u': {3},
|
||||
'r': {0, 2},
|
||||
'y': {1, 3},
|
||||
's': {1, 2},
|
||||
'w': {0, 3},
|
||||
'k': {2, 3},
|
||||
'm': {0, 1},
|
||||
'b': {1, 2, 3},
|
||||
'd': {0, 2, 3},
|
||||
'h': {0, 1, 3},
|
||||
'v': {0, 1, 2},
|
||||
'n': {0, 1, 2, 3},
|
||||
}
|
||||
|
||||
var decode = map[uint64]byte{
|
||||
0: 'a',
|
||||
1: 'c',
|
||||
2: 'g',
|
||||
3: 't',
|
||||
}
|
||||
|
||||
type KmerIdx_t interface {
|
||||
KmerIdx32 | KmerIdx64 | KmerIdx128
|
||||
}
|
||||
|
||||
type DeBruijnGraph struct {
|
||||
kmersize int
|
||||
kmermask uint64
|
||||
prevc uint64
|
||||
prevg uint64
|
||||
prevt uint64
|
||||
graph map[uint64]uint
|
||||
}
|
||||
|
||||
func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph {
|
||||
g := DeBruijnGraph{
|
||||
kmersize: kmersize,
|
||||
kmermask: ^(^uint64(0) << (uint64(kmersize+1) * 2)),
|
||||
prevc: uint64(1) << (uint64(kmersize) * 2),
|
||||
prevg: uint64(2) << (uint64(kmersize) * 2),
|
||||
prevt: uint64(3) << (uint64(kmersize) * 2),
|
||||
graph: make(map[uint64]uint),
|
||||
}
|
||||
|
||||
return &g
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) KmerSize() int {
|
||||
return g.kmersize
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Len() int {
|
||||
return len(g.graph)
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxLink() int {
|
||||
max := uint(0)
|
||||
for _, count := range g.graph {
|
||||
if count > max {
|
||||
max = count
|
||||
}
|
||||
}
|
||||
|
||||
return int(max)
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) LinkSpectrum() []int {
|
||||
max := g.MaxLink()
|
||||
spectrum := make([]int, max+1)
|
||||
for _, count := range g.graph {
|
||||
spectrum[int(count)]++
|
||||
}
|
||||
|
||||
return spectrum
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) FilterMin(min int) {
|
||||
umin := uint(min)
|
||||
for idx, count := range g.graph {
|
||||
if count < umin {
|
||||
delete(g.graph, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
|
||||
rep := make([]uint64, 0, 4)
|
||||
index = index >> 2
|
||||
|
||||
if _, ok := g.graph[index]; ok {
|
||||
rep = append(rep, index)
|
||||
}
|
||||
|
||||
key := index | g.prevc
|
||||
if _, ok := g.graph[key]; ok {
|
||||
rep = append(rep, key)
|
||||
}
|
||||
|
||||
key = index | g.prevg
|
||||
if _, ok := g.graph[key]; ok {
|
||||
rep = append(rep, key)
|
||||
}
|
||||
|
||||
key = index | g.prevt
|
||||
if _, ok := g.graph[key]; ok {
|
||||
rep = append(rep, key)
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
|
||||
rep := make([]uint64, 0, 4)
|
||||
index = index << 2 & g.kmermask
|
||||
|
||||
if _, ok := g.graph[index]; ok {
|
||||
rep = append(rep, index)
|
||||
}
|
||||
|
||||
key := index | 1
|
||||
if _, ok := g.graph[key]; ok {
|
||||
rep = append(rep, key)
|
||||
}
|
||||
|
||||
key = index | 2
|
||||
if _, ok := g.graph[key]; ok {
|
||||
rep = append(rep, key)
|
||||
}
|
||||
|
||||
key = index | 3
|
||||
if _, ok := g.graph[key]; ok {
|
||||
rep = append(rep, key)
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
|
||||
ns := g.Nexts(index)
|
||||
|
||||
if len(ns) == 0 {
|
||||
return uint64(0), false
|
||||
}
|
||||
|
||||
max := uint(0)
|
||||
rep := uint64(0)
|
||||
for _, idx := range ns {
|
||||
w, _ := g.graph[idx]
|
||||
if w > max {
|
||||
rep = idx
|
||||
}
|
||||
}
|
||||
|
||||
return rep, true
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxPath() []uint64 {
|
||||
path := make([]uint64, 0, 1000)
|
||||
ok := false
|
||||
idx := uint64(0)
|
||||
|
||||
idx, ok = g.MaxHead()
|
||||
|
||||
for ok {
|
||||
path = append(path, idx)
|
||||
idx, ok = g.MaxNext(idx)
|
||||
}
|
||||
|
||||
return path
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) LongestPath() []uint64 {
|
||||
var path []uint64
|
||||
wmax := uint(0)
|
||||
ok := true
|
||||
|
||||
starts:= g.Heads()
|
||||
|
||||
for _,idx := range starts {
|
||||
lp := make([]uint64, 0, 1000)
|
||||
w := uint(0)
|
||||
for ok {
|
||||
nw:= g.graph[idx]
|
||||
w+=nw
|
||||
lp = append(lp, idx)
|
||||
idx, ok = g.MaxNext(idx)
|
||||
}
|
||||
|
||||
if w > wmax {
|
||||
path=lp
|
||||
wmax=w
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return path
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence,error) {
|
||||
path := g.LongestPath()
|
||||
s := g.DecodePath(path)
|
||||
|
||||
if len(s) > 0 {
|
||||
seq := obiseq.MakeBioSequence(
|
||||
id,
|
||||
[]byte(s),
|
||||
"",
|
||||
)
|
||||
|
||||
return &seq,nil
|
||||
}
|
||||
|
||||
return nil,fmt.Errorf("cannot identify optimum path")
|
||||
}
|
||||
|
||||
|
||||
func (g *DeBruijnGraph) Heads() []uint64 {
|
||||
rep := make([]uint64, 0, 10)
|
||||
|
||||
for k := range g.graph {
|
||||
if len(g.Previouses(k)) == 0 {
|
||||
rep = append(rep, k)
|
||||
}
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxHead() (uint64, bool) {
|
||||
rep := uint64(0)
|
||||
max := uint(0)
|
||||
found := false
|
||||
for k, w := range g.graph {
|
||||
if len(g.Previouses(k)) == 0 && w > max {
|
||||
rep = k
|
||||
found = true
|
||||
}
|
||||
}
|
||||
|
||||
return rep, found
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) DecodeNode(index uint64) string {
|
||||
rep := make([]byte, g.kmersize)
|
||||
index >>= 2
|
||||
for i := g.kmersize - 1; i >= 0; i-- {
|
||||
rep[i], _ = decode[index&3]
|
||||
index >>= 2
|
||||
}
|
||||
|
||||
return string(rep)
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) DecodePath(path []uint64) string {
|
||||
rep := make([]byte, 0, len(path)+g.kmersize)
|
||||
buf := bytes.NewBuffer(rep)
|
||||
|
||||
if len(path) > 0 {
|
||||
buf.WriteString(g.DecodeNode(path[0]))
|
||||
|
||||
for _, idx := range path[1:] {
|
||||
buf.WriteByte(decode[idx&3])
|
||||
}
|
||||
}
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence,error) {
|
||||
path := g.MaxPath()
|
||||
s := g.DecodePath(path)
|
||||
|
||||
if len(s) > 0 {
|
||||
seq := obiseq.MakeBioSequence(
|
||||
id,
|
||||
[]byte(s),
|
||||
"",
|
||||
)
|
||||
|
||||
return &seq,nil
|
||||
}
|
||||
|
||||
return nil,fmt.Errorf("cannot identify optimum path")
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Weight(index uint64) int {
|
||||
val, ok := g.graph[index]
|
||||
if !ok {
|
||||
val = 0
|
||||
}
|
||||
return int(val)
|
||||
}
|
||||
|
||||
func (graph *DeBruijnGraph) append(sequence []byte, current uint64) {
|
||||
|
||||
for i := 0; i < len(sequence); i++ {
|
||||
current <<= 2
|
||||
current &= graph.kmermask
|
||||
b := iupac[sequence[i]]
|
||||
if len(b) == 1 {
|
||||
current |= b[0]
|
||||
|
||||
weight, ok := graph.graph[current]
|
||||
if !ok {
|
||||
weight = 0
|
||||
}
|
||||
graph.graph[current] = weight + 1
|
||||
} else {
|
||||
for j := 0; j < len(b); j++ {
|
||||
current &= ^uint64(3)
|
||||
current |= b[j]
|
||||
|
||||
weight, ok := graph.graph[current]
|
||||
if !ok {
|
||||
weight = 0
|
||||
}
|
||||
graph.graph[current] = weight + 1
|
||||
graph.append(sequence[(i+1):], current)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
|
||||
key := uint64(0)
|
||||
s := sequence.Sequence()
|
||||
init := make([]uint64, 0, 16)
|
||||
var f func(start int, key uint64)
|
||||
f = func(start int, key uint64) {
|
||||
for i := start; i < graph.kmersize; i++ {
|
||||
key <<= 2
|
||||
b := iupac[s[i]]
|
||||
if len(b) == 1 {
|
||||
key |= b[0]
|
||||
} else {
|
||||
for j := 0; j < len(b); j++ {
|
||||
key &= ^uint64(3)
|
||||
key |= b[j]
|
||||
f(i+1, key)
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
init = append(init, key&graph.kmermask)
|
||||
}
|
||||
|
||||
f(0, key)
|
||||
|
||||
for _, idx := range init {
|
||||
graph.append(s[graph.kmersize:], idx)
|
||||
}
|
||||
|
||||
}
|
Reference in New Issue
Block a user