2023-03-27 19:51:10 +07:00
|
|
|
package obikmer
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"fmt"
|
2023-03-27 22:43:45 +07:00
|
|
|
"log"
|
2023-03-28 11:43:46 +07:00
|
|
|
"math"
|
2023-03-27 19:51:10 +07:00
|
|
|
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
|
|
|
)
|
|
|
|
|
|
|
|
type KmerIdx32 uint32
|
|
|
|
type KmerIdx64 uint64
|
|
|
|
type KmerIdx128 struct {
|
|
|
|
Lo uint64
|
|
|
|
Hi uint64
|
|
|
|
}
|
|
|
|
|
|
|
|
var iupac = map[byte][]uint64{
|
|
|
|
'a': {0},
|
|
|
|
'c': {1},
|
|
|
|
'g': {2},
|
|
|
|
't': {3},
|
|
|
|
'u': {3},
|
|
|
|
'r': {0, 2},
|
|
|
|
'y': {1, 3},
|
|
|
|
's': {1, 2},
|
|
|
|
'w': {0, 3},
|
|
|
|
'k': {2, 3},
|
|
|
|
'm': {0, 1},
|
|
|
|
'b': {1, 2, 3},
|
|
|
|
'd': {0, 2, 3},
|
|
|
|
'h': {0, 1, 3},
|
|
|
|
'v': {0, 1, 2},
|
|
|
|
'n': {0, 1, 2, 3},
|
|
|
|
}
|
|
|
|
|
|
|
|
var decode = map[uint64]byte{
|
|
|
|
0: 'a',
|
|
|
|
1: 'c',
|
|
|
|
2: 'g',
|
|
|
|
3: 't',
|
|
|
|
}
|
|
|
|
|
|
|
|
type KmerIdx_t interface {
|
|
|
|
KmerIdx32 | KmerIdx64 | KmerIdx128
|
|
|
|
}
|
|
|
|
|
|
|
|
type DeBruijnGraph struct {
|
|
|
|
kmersize int
|
|
|
|
kmermask uint64
|
|
|
|
prevc uint64
|
|
|
|
prevg uint64
|
|
|
|
prevt uint64
|
|
|
|
graph map[uint64]uint
|
|
|
|
}
|
|
|
|
|
|
|
|
func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph {
|
|
|
|
g := DeBruijnGraph{
|
|
|
|
kmersize: kmersize,
|
|
|
|
kmermask: ^(^uint64(0) << (uint64(kmersize+1) * 2)),
|
|
|
|
prevc: uint64(1) << (uint64(kmersize) * 2),
|
|
|
|
prevg: uint64(2) << (uint64(kmersize) * 2),
|
|
|
|
prevt: uint64(3) << (uint64(kmersize) * 2),
|
|
|
|
graph: make(map[uint64]uint),
|
|
|
|
}
|
|
|
|
|
|
|
|
return &g
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) KmerSize() int {
|
|
|
|
return g.kmersize
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) Len() int {
|
|
|
|
return len(g.graph)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) MaxLink() int {
|
|
|
|
max := uint(0)
|
|
|
|
for _, count := range g.graph {
|
|
|
|
if count > max {
|
|
|
|
max = count
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return int(max)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) LinkSpectrum() []int {
|
|
|
|
max := g.MaxLink()
|
|
|
|
spectrum := make([]int, max+1)
|
|
|
|
for _, count := range g.graph {
|
|
|
|
spectrum[int(count)]++
|
|
|
|
}
|
|
|
|
|
|
|
|
return spectrum
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) FilterMin(min int) {
|
|
|
|
umin := uint(min)
|
|
|
|
for idx, count := range g.graph {
|
|
|
|
if count < umin {
|
|
|
|
delete(g.graph, idx)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
|
|
|
|
rep := make([]uint64, 0, 4)
|
|
|
|
index = index >> 2
|
|
|
|
|
|
|
|
if _, ok := g.graph[index]; ok {
|
|
|
|
rep = append(rep, index)
|
|
|
|
}
|
|
|
|
|
|
|
|
key := index | g.prevc
|
|
|
|
if _, ok := g.graph[key]; ok {
|
|
|
|
rep = append(rep, key)
|
|
|
|
}
|
|
|
|
|
|
|
|
key = index | g.prevg
|
|
|
|
if _, ok := g.graph[key]; ok {
|
|
|
|
rep = append(rep, key)
|
|
|
|
}
|
|
|
|
|
|
|
|
key = index | g.prevt
|
|
|
|
if _, ok := g.graph[key]; ok {
|
|
|
|
rep = append(rep, key)
|
|
|
|
}
|
|
|
|
|
|
|
|
return rep
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
|
|
|
|
rep := make([]uint64, 0, 4)
|
|
|
|
index = index << 2 & g.kmermask
|
|
|
|
|
|
|
|
if _, ok := g.graph[index]; ok {
|
|
|
|
rep = append(rep, index)
|
|
|
|
}
|
|
|
|
|
|
|
|
key := index | 1
|
|
|
|
if _, ok := g.graph[key]; ok {
|
|
|
|
rep = append(rep, key)
|
|
|
|
}
|
|
|
|
|
|
|
|
key = index | 2
|
|
|
|
if _, ok := g.graph[key]; ok {
|
|
|
|
rep = append(rep, key)
|
|
|
|
}
|
|
|
|
|
|
|
|
key = index | 3
|
|
|
|
if _, ok := g.graph[key]; ok {
|
|
|
|
rep = append(rep, key)
|
|
|
|
}
|
|
|
|
|
|
|
|
return rep
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
|
|
|
|
ns := g.Nexts(index)
|
|
|
|
|
|
|
|
if len(ns) == 0 {
|
|
|
|
return uint64(0), false
|
|
|
|
}
|
|
|
|
|
|
|
|
max := uint(0)
|
|
|
|
rep := uint64(0)
|
|
|
|
for _, idx := range ns {
|
2023-03-27 22:43:45 +07:00
|
|
|
w := g.graph[idx]
|
2023-03-27 19:51:10 +07:00
|
|
|
if w > max {
|
|
|
|
rep = idx
|
2023-03-27 22:43:45 +07:00
|
|
|
max = w
|
2023-03-27 19:51:10 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return rep, true
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) MaxPath() []uint64 {
|
|
|
|
path := make([]uint64, 0, 1000)
|
|
|
|
ok := false
|
|
|
|
idx := uint64(0)
|
|
|
|
|
|
|
|
idx, ok = g.MaxHead()
|
|
|
|
|
|
|
|
for ok {
|
|
|
|
path = append(path, idx)
|
|
|
|
idx, ok = g.MaxNext(idx)
|
|
|
|
}
|
|
|
|
|
|
|
|
return path
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) LongestPath() []uint64 {
|
2023-03-27 22:43:45 +07:00
|
|
|
var path []uint64
|
2023-03-27 19:51:10 +07:00
|
|
|
wmax := uint(0)
|
|
|
|
ok := true
|
|
|
|
|
2023-03-27 22:43:45 +07:00
|
|
|
starts := g.Heads()
|
|
|
|
log.Println(starts)
|
|
|
|
for _, idx := range starts {
|
2023-03-27 19:51:10 +07:00
|
|
|
lp := make([]uint64, 0, 1000)
|
2023-03-27 22:43:45 +07:00
|
|
|
ok = true
|
2023-03-27 19:51:10 +07:00
|
|
|
w := uint(0)
|
|
|
|
for ok {
|
2023-03-27 22:43:45 +07:00
|
|
|
nw := g.graph[idx]
|
|
|
|
w += nw
|
2023-03-27 19:51:10 +07:00
|
|
|
lp = append(lp, idx)
|
|
|
|
idx, ok = g.MaxNext(idx)
|
|
|
|
}
|
2023-03-27 22:43:45 +07:00
|
|
|
|
|
|
|
log.Printf("max : %d current %d", wmax, w)
|
2023-03-27 19:51:10 +07:00
|
|
|
if w > wmax {
|
2023-03-27 22:43:45 +07:00
|
|
|
path = lp
|
|
|
|
wmax = w
|
2023-03-27 19:51:10 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return path
|
|
|
|
}
|
|
|
|
|
2023-03-27 22:43:45 +07:00
|
|
|
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) {
|
2023-03-27 19:51:10 +07:00
|
|
|
path := g.LongestPath()
|
|
|
|
s := g.DecodePath(path)
|
|
|
|
|
|
|
|
if len(s) > 0 {
|
|
|
|
seq := obiseq.MakeBioSequence(
|
|
|
|
id,
|
|
|
|
[]byte(s),
|
|
|
|
"",
|
|
|
|
)
|
|
|
|
|
2023-03-27 22:43:45 +07:00
|
|
|
return &seq, nil
|
2023-03-27 19:51:10 +07:00
|
|
|
}
|
|
|
|
|
2023-03-27 22:43:45 +07:00
|
|
|
return nil, fmt.Errorf("cannot identify optimum path")
|
2023-03-27 19:51:10 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) Heads() []uint64 {
|
|
|
|
rep := make([]uint64, 0, 10)
|
|
|
|
|
|
|
|
for k := range g.graph {
|
|
|
|
if len(g.Previouses(k)) == 0 {
|
|
|
|
rep = append(rep, k)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return rep
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) MaxHead() (uint64, bool) {
|
|
|
|
rep := uint64(0)
|
|
|
|
max := uint(0)
|
|
|
|
found := false
|
|
|
|
for k, w := range g.graph {
|
|
|
|
if len(g.Previouses(k)) == 0 && w > max {
|
|
|
|
rep = k
|
|
|
|
found = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return rep, found
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) DecodeNode(index uint64) string {
|
|
|
|
rep := make([]byte, g.kmersize)
|
|
|
|
index >>= 2
|
|
|
|
for i := g.kmersize - 1; i >= 0; i-- {
|
|
|
|
rep[i], _ = decode[index&3]
|
|
|
|
index >>= 2
|
|
|
|
}
|
|
|
|
|
|
|
|
return string(rep)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) DecodePath(path []uint64) string {
|
|
|
|
rep := make([]byte, 0, len(path)+g.kmersize)
|
|
|
|
buf := bytes.NewBuffer(rep)
|
|
|
|
|
|
|
|
if len(path) > 0 {
|
|
|
|
buf.WriteString(g.DecodeNode(path[0]))
|
|
|
|
|
|
|
|
for _, idx := range path[1:] {
|
|
|
|
buf.WriteByte(decode[idx&3])
|
2023-03-27 22:43:45 +07:00
|
|
|
}
|
2023-03-27 19:51:10 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
return buf.String()
|
|
|
|
}
|
|
|
|
|
2023-03-27 22:43:45 +07:00
|
|
|
func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence, error) {
|
2023-03-27 19:51:10 +07:00
|
|
|
path := g.MaxPath()
|
|
|
|
s := g.DecodePath(path)
|
|
|
|
|
|
|
|
if len(s) > 0 {
|
|
|
|
seq := obiseq.MakeBioSequence(
|
|
|
|
id,
|
|
|
|
[]byte(s),
|
|
|
|
"",
|
|
|
|
)
|
|
|
|
|
2023-03-27 22:43:45 +07:00
|
|
|
return &seq, nil
|
2023-03-27 19:51:10 +07:00
|
|
|
}
|
|
|
|
|
2023-03-27 22:43:45 +07:00
|
|
|
return nil, fmt.Errorf("cannot identify optimum path")
|
2023-03-27 19:51:10 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
func (g *DeBruijnGraph) Weight(index uint64) int {
|
|
|
|
val, ok := g.graph[index]
|
|
|
|
if !ok {
|
|
|
|
val = 0
|
|
|
|
}
|
|
|
|
return int(val)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (graph *DeBruijnGraph) append(sequence []byte, current uint64) {
|
|
|
|
|
|
|
|
for i := 0; i < len(sequence); i++ {
|
|
|
|
current <<= 2
|
|
|
|
current &= graph.kmermask
|
|
|
|
b := iupac[sequence[i]]
|
|
|
|
if len(b) == 1 {
|
|
|
|
current |= b[0]
|
|
|
|
|
|
|
|
weight, ok := graph.graph[current]
|
|
|
|
if !ok {
|
|
|
|
weight = 0
|
|
|
|
}
|
|
|
|
graph.graph[current] = weight + 1
|
|
|
|
} else {
|
|
|
|
for j := 0; j < len(b); j++ {
|
|
|
|
current &= ^uint64(3)
|
|
|
|
current |= b[j]
|
|
|
|
|
|
|
|
weight, ok := graph.graph[current]
|
|
|
|
if !ok {
|
|
|
|
weight = 0
|
|
|
|
}
|
|
|
|
graph.graph[current] = weight + 1
|
|
|
|
graph.append(sequence[(i+1):], current)
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
|
|
|
|
key := uint64(0)
|
|
|
|
s := sequence.Sequence()
|
|
|
|
init := make([]uint64, 0, 16)
|
|
|
|
var f func(start int, key uint64)
|
|
|
|
f = func(start int, key uint64) {
|
|
|
|
for i := start; i < graph.kmersize; i++ {
|
|
|
|
key <<= 2
|
|
|
|
b := iupac[s[i]]
|
|
|
|
if len(b) == 1 {
|
|
|
|
key |= b[0]
|
|
|
|
} else {
|
|
|
|
for j := 0; j < len(b); j++ {
|
|
|
|
key &= ^uint64(3)
|
|
|
|
key |= b[j]
|
|
|
|
f(i+1, key)
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
init = append(init, key&graph.kmermask)
|
|
|
|
}
|
|
|
|
|
|
|
|
f(0, key)
|
|
|
|
|
|
|
|
for _, idx := range init {
|
|
|
|
graph.append(s[graph.kmersize:], idx)
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
2023-03-27 22:43:45 +07:00
|
|
|
|
2023-03-28 11:43:46 +07:00
|
|
|
func (graph *DeBruijnGraph) Gml() string {
|
2023-03-27 22:43:45 +07:00
|
|
|
buffer := bytes.NewBuffer(make([]byte, 0, 1000))
|
|
|
|
|
|
|
|
buffer.WriteString(
|
|
|
|
`graph [
|
|
|
|
comment "De Bruijn graph"
|
|
|
|
directed 1
|
|
|
|
|
|
|
|
`)
|
|
|
|
|
|
|
|
for idx := range graph.graph {
|
|
|
|
node := graph.DecodeNode(idx)
|
|
|
|
buffer.WriteString(
|
|
|
|
fmt.Sprintf("node [ id \"%s\" ]\n", node),
|
|
|
|
)
|
|
|
|
n := graph.Nexts(uint64(idx))
|
|
|
|
if len(n) == 0 {
|
|
|
|
idx <<= 2
|
|
|
|
idx &= graph.kmermask
|
|
|
|
node := graph.DecodeNode(idx)
|
|
|
|
buffer.WriteString(
|
|
|
|
fmt.Sprintf("node [ id \"%s\" \n label \"%s\" ]\n", node, node),
|
|
|
|
)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for idx, weight := range graph.graph {
|
|
|
|
src := graph.DecodeNode(idx)
|
|
|
|
label := decode[idx&3]
|
|
|
|
idx <<= 2
|
|
|
|
idx &= graph.kmermask
|
|
|
|
dst := graph.DecodeNode(idx)
|
|
|
|
|
|
|
|
buffer.WriteString(
|
|
|
|
fmt.Sprintf(`edge [ source "%s"
|
|
|
|
target "%s"
|
|
|
|
color "#00FF00"
|
|
|
|
label "%c[%d]"
|
2023-03-28 11:43:46 +07:00
|
|
|
graphics [
|
|
|
|
width %f
|
|
|
|
arrow "last"
|
|
|
|
fill "#007F80"
|
|
|
|
]
|
2023-03-27 22:43:45 +07:00
|
|
|
]
|
|
|
|
|
2023-03-28 11:43:46 +07:00
|
|
|
`, src, dst, label, weight, math.Log(float64(weight))),
|
2023-03-27 22:43:45 +07:00
|
|
|
)
|
|
|
|
}
|
|
|
|
buffer.WriteString("]\n")
|
|
|
|
return buffer.String()
|
|
|
|
|
|
|
|
}
|