Add obiminion first version

Former-commit-id: aa5ace7bd4d2266333715fca7094d1c3cbbb5e6d
This commit is contained in:
Eric Coissac
2024-05-14 08:16:12 +02:00
parent 9e63013bc2
commit 017030bcce
24 changed files with 1599 additions and 469 deletions

View File

@ -9,9 +9,10 @@ import (
type Table4mer [256]uint16
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
iternal_buffer := Encode4mer(seq, buffer)
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
iternal_buffer := Encode4mer(seq, buffer) // The slice of 4-mer codes
if counts == nil {
var w Table4mer
@ -19,7 +20,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl
}
// Every cells of the counter is set to zero
for i := 0; i < 256; i++ {
for i := 0; i < 256; i++ { // 256 is the number of possible 4-mer codes
(*counts)[i] = 0
}
@ -32,7 +33,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl
func Common4Mer(count1, count2 *Table4mer) int {
sum := 0
for i := 0; i < 256; i++ {
sum += int(obiutils.MinUInt16((*count1)[i], (*count2)[i]))
sum += int(obiutils.Min((*count1)[i], (*count2)[i]))
}
return sum
}
@ -48,7 +49,7 @@ func Sum4Mer(count *Table4mer) int {
func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
s1 := Sum4Mer(count1)
s2 := Sum4Mer(count2)
smin := obiutils.MinInt(s1, s2)
smin := obiutils.Min(s1, s2)
cw := Common4Mer(count1, count2)
@ -65,7 +66,7 @@ func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
func Error4MerBounds(count1, count2 *Table4mer) (int, int) {
s1 := Sum4Mer(count1)
s2 := Sum4Mer(count2)
smax := obiutils.MaxInt(s1, s2)
smax := obiutils.Max(s1, s2)
cw := Common4Mer(count1, count2)

View File

@ -2,13 +2,16 @@ package obikmer
import (
"bytes"
"container/heap"
"fmt"
"math"
"math/bits"
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/daichi-m/go18ds/sets/linkedhashset"
"github.com/daichi-m/go18ds/stacks/arraystack"
log "github.com/sirupsen/logrus"
)
type KmerIdx32 uint32
@ -49,36 +52,60 @@ type KmerIdx_t interface {
}
type DeBruijnGraph struct {
kmersize int
kmermask uint64
prevc uint64
kmersize int // k-mer size
kmermask uint64 // mask used to set to 0 the bits that are not in the k-mer
prevc uint64 //
prevg uint64
prevt uint64
graph map[uint64]uint
graph map[uint64]uint // Kmer are encoded as uint64 with 2 bits per character
}
// MakeDeBruijnGraph creates a De Bruijn Graph with the specified k-mer size.
//
// Parameters:
//
// kmersize int - the size of the k-mers
//
// Returns:
//
// *DeBruijnGraph - a pointer to the created De Bruijn's Graph
func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph {
g := DeBruijnGraph{
kmersize: kmersize,
kmermask: ^(^uint64(0) << (uint64(kmersize+1) * 2)),
prevc: uint64(1) << (uint64(kmersize) * 2),
prevg: uint64(2) << (uint64(kmersize) * 2),
prevt: uint64(3) << (uint64(kmersize) * 2),
kmermask: ^(^uint64(0) << (uint64(kmersize) * 2)), // k-mer mask used to set to 0 the bits that are not in the k-mer
prevc: uint64(1) << (uint64(kmersize-1) * 2),
prevg: uint64(2) << (uint64(kmersize-1) * 2),
prevt: uint64(3) << (uint64(kmersize-1) * 2),
graph: make(map[uint64]uint),
}
return &g
}
// KmerSize returns the size of the k-mers in the DeBruijn graph.
//
// This function takes no parameters.
// It returns an integer representing the size of the k-mers.
func (g *DeBruijnGraph) KmerSize() int {
return g.kmersize
}
// Len returns the length of the graph.
//
// This function takes no parameters.
// It returns an integer representing the number of nodes in the graph.
func (g *DeBruijnGraph) Len() int {
return len(g.graph)
}
func (g *DeBruijnGraph) MaxLink() int {
// MaxWeight returns the maximum weight of a node from the DeBruijn's Graph.
//
// It iterates over each count in the graph map and updates the max value if the current count is greater.
// Finally, it returns the maximum weight as an integer.
//
// Returns:
// - int: the maximum weight value.
func (g *DeBruijnGraph) MaxWeight() int {
max := uint(0)
for _, count := range g.graph {
if count > max {
@ -89,8 +116,12 @@ func (g *DeBruijnGraph) MaxLink() int {
return int(max)
}
func (g *DeBruijnGraph) LinkSpectrum() []int {
max := g.MaxLink()
// WeightSpectrum calculates the weight spectrum of nodes in the DeBruijn's graph.
//
// No parameters.
// Returns an array of integers representing the weight spectrum.
func (g *DeBruijnGraph) WeightSpectrum() []int {
max := g.MaxWeight()
spectrum := make([]int, max+1)
for _, count := range g.graph {
spectrum[int(count)]++
@ -99,7 +130,10 @@ func (g *DeBruijnGraph) LinkSpectrum() []int {
return spectrum
}
func (g *DeBruijnGraph) FilterMin(min int) {
// FilterMinWeight filters the DeBruijnGraph by removing nodes with weight less than the specified minimum.
//
// min: an integer representing the minimum count threshold.
func (g *DeBruijnGraph) FilterMinWeight(min int) {
umin := uint(min)
for idx, count := range g.graph {
if count < umin {
@ -109,8 +143,12 @@ func (g *DeBruijnGraph) FilterMin(min int) {
}
func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
if _, ok := g.graph[index]; !ok {
log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index)
}
rep := make([]uint64, 0, 4)
index = index >> 2
index >>= 2
if _, ok := g.graph[index]; ok {
rep = append(rep, index)
@ -135,6 +173,10 @@ func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
}
func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
if _, ok := g.graph[index]; !ok {
log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index)
}
rep := make([]uint64, 0, 4)
index = (index << 2) & g.kmermask
@ -160,11 +202,11 @@ func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
return rep
}
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, int, bool) {
ns := g.Nexts(index)
if len(ns) == 0 {
return uint64(0), false
return uint64(0), 0, false
}
max := uint(0)
@ -177,7 +219,34 @@ func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
}
}
return rep, true
return rep, int(max), true
}
func (g *DeBruijnGraph) Heads() []uint64 {
rep := make([]uint64, 0, 10)
for k := range g.graph {
if len(g.Previouses(k)) == 0 {
rep = append(rep, k)
}
}
return rep
}
func (g *DeBruijnGraph) MaxHead() (uint64, int, bool) {
rep := uint64(0)
max := uint(0)
found := false
for k, w := range g.graph {
if len(g.Previouses(k)) == 0 && w > max {
rep = k
max = w
found = true
}
}
return rep, int(max), found
}
func (g *DeBruijnGraph) MaxPath() []uint64 {
@ -185,17 +254,17 @@ func (g *DeBruijnGraph) MaxPath() []uint64 {
ok := false
idx := uint64(0)
idx, ok = g.MaxHead()
idx, _, ok = g.MaxHead()
for ok {
path = append(path, idx)
idx, ok = g.MaxNext(idx)
idx, _, ok = g.MaxNext(idx)
}
return path
}
func (g *DeBruijnGraph) LongestPath() []uint64 {
func (g *DeBruijnGraph) LongestPath(max_length int) []uint64 {
var path []uint64
wmax := uint(0)
ok := true
@ -209,7 +278,11 @@ func (g *DeBruijnGraph) LongestPath() []uint64 {
nw := g.graph[idx]
w += nw
lp = append(lp, idx)
idx, ok = g.MaxNext(idx)
idx, _, ok = g.MaxNext(idx)
if max_length > 0 && len(lp) > max_length {
ok = false
w = 0
}
}
if w > wmax {
@ -221,8 +294,9 @@ func (g *DeBruijnGraph) LongestPath() []uint64 {
return path
}
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) {
path := g.LongestPath()
func (g *DeBruijnGraph) LongestConsensus(id string, max_length int) (*obiseq.BioSequence, error) {
//path := g.LongestPath(max_length)
path := g.HaviestPath()
s := g.DecodePath(path)
if len(s) > 0 {
@ -238,37 +312,10 @@ func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error)
return nil, fmt.Errorf("cannot identify optimum path")
}
func (g *DeBruijnGraph) Heads() []uint64 {
rep := make([]uint64, 0, 10)
for k := range g.graph {
if len(g.Previouses(k)) == 0 {
rep = append(rep, k)
}
}
return rep
}
func (g *DeBruijnGraph) MaxHead() (uint64, bool) {
rep := uint64(0)
max := uint(0)
found := false
for k, w := range g.graph {
if len(g.Previouses(k)) == 0 && w > max {
rep = k
found = true
}
}
return rep, found
}
func (g *DeBruijnGraph) DecodeNode(index uint64) string {
rep := make([]byte, g.kmersize)
index >>= 2
for i := g.kmersize - 1; i >= 0; i-- {
rep[i], _ = decode[index&3]
rep[i] = decode[index&3]
index >>= 2
}
@ -282,7 +329,7 @@ func (g *DeBruijnGraph) DecodePath(path []uint64) string {
if len(path) > 0 {
buf.WriteString(g.DecodeNode(path[0]))
for _, idx := range path {
for _, idx := range path[1:] {
buf.WriteByte(decode[idx&3])
}
}
@ -307,6 +354,13 @@ func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence, error) {
return nil, fmt.Errorf("cannot identify optimum path")
}
// Weight returns the weight of the node at the given index in the DeBruijnGraph.
//
// Parameters:
// - index: the index of the node in the graph.
//
// Returns:
// - int: the weight of the node.
func (g *DeBruijnGraph) Weight(index uint64) int {
val, ok := g.graph[index]
if !ok {
@ -315,59 +369,64 @@ func (g *DeBruijnGraph) Weight(index uint64) int {
return int(val)
}
// append appends a sequence of nucleotides to the DeBruijnGraph.
//
// Parameters:
// - sequence: a byte slice representing the sequence of nucleotides to append.
// - current: the current node in the graph to which the sequence will be appended.
// - weight: the weight of the added nodes.
func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int) {
for i := 0; i < len(sequence); i++ {
current <<= 2
current &= graph.kmermask
b := iupac[sequence[i]]
if len(b) == 1 {
current |= b[0]
graph.graph[current] = uint(graph.Weight(current) + weight)
} else {
for j := 0; j < len(b); j++ {
current &= ^uint64(3)
current |= b[j]
if len(sequence) == 0 {
return
}
graph.graph[current] = uint(graph.Weight(current) + weight)
graph.append(sequence[(i+1):], current, weight)
}
return
}
current <<= 2
current &= graph.kmermask
b := iupac[sequence[0]]
current |= b[0]
graph.graph[current] = uint(graph.Weight(current) + weight)
graph.append(sequence[1:], current, weight)
for j := 1; j < len(b); j++ {
current &= ^uint64(3)
current |= b[j]
graph.graph[current] = uint(graph.Weight(current) + weight)
graph.append(sequence[1:], current, weight)
}
}
func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
key := uint64(0)
s := sequence.Sequence()
w := sequence.Count()
init := make([]uint64, 0, 16)
var f func(start int, key uint64)
f = func(start int, key uint64) {
for i := start; i < graph.kmersize; i++ {
key <<= 2
b := iupac[s[i]]
if len(b) == 1 {
key |= b[0]
} else {
for j := 0; j < len(b); j++ {
key &= ^uint64(3)
key |= b[j]
f(i+1, key)
}
return
}
s := sequence.Sequence() // Get the sequence as a byte slice
w := sequence.Count() // Get the weight of the sequence
var initFirstKmer func(start int, key uint64)
// Initialize the first k-mer
// start is the index of the nucleotide in the k-mer to add
// key is the value of the k-mer index before adding the start nucleotide
initFirstKmer = func(start int, key uint64) {
if start == 0 {
key = 0
}
if start < graph.kmersize {
key <<= 2
b := iupac[s[start]]
for _, code := range b {
key &= ^uint64(3)
key |= code
initFirstKmer(start+1, key)
}
} else {
graph.graph[key] = uint(graph.Weight(key) + w)
graph.append(s[graph.kmersize:], key, w)
}
init = append(init, key&graph.kmermask)
}
if sequence.Len() > graph.kmersize {
f(0, key)
for _, idx := range init {
graph.append(s[graph.kmersize:], idx, w)
}
initFirstKmer(0, 0)
}
}
@ -381,43 +440,51 @@ func (graph *DeBruijnGraph) Gml() string {
`)
nodeidx := make(map[uint64]int)
nodeid := 0
for idx := range graph.graph {
node := graph.DecodeNode(idx)
buffer.WriteString(
fmt.Sprintf("node [ id \"%s\" ]\n", node),
)
n := graph.Nexts(uint64(idx))
if len(n) == 0 {
idx <<= 2
idx &= graph.kmermask
nodeid++
nodeidx[idx] = nodeid
n := graph.Nexts(idx)
p := graph.Previouses(idx)
if len(n) == 0 || len(p) == 0 {
node := graph.DecodeNode(idx)
buffer.WriteString(
fmt.Sprintf("node [ id \"%s\" \n label \"%s\" ]\n", node, node),
fmt.Sprintf("node [ id \"%d\" \n label \"%s\" ]\n", nodeid, node),
)
} else {
buffer.WriteString(
fmt.Sprintf("node [ id \"%d\" ]\n", nodeid),
)
}
}
for idx, weight := range graph.graph {
src := graph.DecodeNode(idx)
label := decode[idx&3]
idx <<= 2
idx &= graph.kmermask
dst := graph.DecodeNode(idx)
buffer.WriteString(
fmt.Sprintf(`edge [ source "%s"
target "%s"
color "#00FF00"
label "%c[%d]"
graphics [
width %f
arrow "last"
fill "#007F80"
for idx := range graph.graph {
srcid := nodeidx[idx]
n := graph.Nexts(idx)
for _, dst := range n {
dstid := nodeidx[dst]
label := decode[dst&3]
weight := graph.Weight(dst)
buffer.WriteString(
fmt.Sprintf(`edge [ source "%d"
target "%d"
color "#00FF00"
label "%c[%d]"
graphics [
width %f
arrow "last"
fill "#007F80"
]
]
]
`, src, dst, label, weight, math.Log(float64(weight))),
)
`, srcid, dstid, label, weight, math.Sqrt(float64(weight))),
)
}
}
buffer.WriteString("]\n")
return buffer.String()
@ -472,3 +539,151 @@ func (g *DeBruijnGraph) HammingDistance(kmer1, kmer2 uint64) int {
ident &= 0x5555555555555555 & g.kmermask
return bits.OnesCount64(ident)
}
type UInt64Heap []uint64
func (h UInt64Heap) Len() int { return len(h) }
func (h UInt64Heap) Less(i, j int) bool { return h[i] < h[j] }
func (h UInt64Heap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *UInt64Heap) Push(x any) {
// Push and Pop use pointer receivers because they modify the slice's length,
// not just its contents.
*h = append(*h, x.(uint64))
}
func (h *UInt64Heap) Pop() any {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
func (g *DeBruijnGraph) HaviestPath() []uint64 {
if g.HasCycle() {
return nil
}
// Initialize the distance array and visited set
distances := make(map[uint64]int)
visited := make(map[uint64]bool)
prevNodes := make(map[uint64]uint64)
heaviestNode := uint64(0)
heaviestWeight := 0
queue := &UInt64Heap{}
heap.Init(queue)
startNodes := make(map[uint64]struct{})
for _, n := range g.Heads() {
startNodes[n] = struct{}{}
heap.Push(queue, n)
distances[n] = g.Weight(n)
prevNodes[n] = 0
visited[n] = false
}
// Priority queue to keep track of nodes to visit
for len(*queue) > 0 {
// Get the node with the smallest distance
currentNode := heap.Pop(queue).(uint64)
// If the current node has already been visited, skip it
if visited[currentNode] {
continue
}
// Mark the node as visited
visited[currentNode] = true
weight := distances[currentNode]
// Update the heaviest node
if weight > heaviestWeight {
heaviestWeight = weight
heaviestNode = currentNode
}
if currentNode == 0 {
log.Warn("current node is 0")
}
// Update the distance of the neighbors
nextNodes := g.Nexts(currentNode)
for _, nextNode := range nextNodes {
if nextNode == 0 {
log.Warn("next node is 0")
}
weight := g.Weight(nextNode) + distances[currentNode]
if distances[nextNode] < weight {
distances[nextNode] = weight
prevNodes[nextNode] = currentNode
visited[nextNode] = false
heap.Push(queue, nextNode)
// Keep track of the node with the heaviest weight
if weight > heaviestWeight {
heaviestWeight = weight
heaviestNode = nextNode
}
}
}
}
log.Infof("Heaviest node: %d [%v]", heaviestNode, heaviestWeight)
// Reconstruct the path from the start node to the heaviest node found
heaviestPath := make([]uint64, 0)
currentNode := heaviestNode
for _, ok := startNodes[currentNode]; !ok && !slices.Contains(heaviestPath, currentNode); _, ok = startNodes[currentNode] {
heaviestPath = append(heaviestPath, currentNode)
//log.Infof("Current node: %d <- %d", currentNode, prevNodes[currentNode])
currentNode = prevNodes[currentNode]
}
if slices.Contains(heaviestPath, currentNode) {
log.Fatalf("Cycle detected %v -> %v (%v) len(%v)", heaviestPath, currentNode, startNodes, len(heaviestPath))
return nil
}
heaviestPath = append(heaviestPath, currentNode)
// Reverse the path
slices.Reverse(heaviestPath)
return heaviestPath
}
func (g *DeBruijnGraph) HasCycle() bool {
// Initialize the visited and stack arrays
visited := make(map[uint64]bool)
stack := make(map[uint64]bool)
// Helper function to perform DFS
var dfs func(node uint64) bool
dfs = func(node uint64) bool {
visited[node] = true
stack[node] = true
nextNodes := g.Nexts(node)
for _, nextNode := range nextNodes {
if !visited[nextNode] {
if dfs(nextNode) {
return true
}
} else if stack[nextNode] {
return true
}
}
stack[node] = false
return false
}
// Perform DFS on each node to check for cycles
for node := range g.graph {
if !visited[node] {
if dfs(node) {
return true
}
}
}
return false
}