mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Add obiminion first version
Former-commit-id: aa5ace7bd4d2266333715fca7094d1c3cbbb5e6d
This commit is contained in:
33
cmd/obitools/obiminion/main.go
Normal file
33
cmd/obitools/obiminion/main.go
Normal file
@ -0,0 +1,33 @@
|
||||
package main

import (
	"os"

	log "github.com/sirupsen/logrus"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiminion"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
)

// main is the entry point of the obiminion command: it parses the
// command-line options, reads the input sequences, runs them through
// obiminion.CLIOBIMinion and writes the resulting sequences out,
// waiting for the processing pipeline to drain before exiting.
func main() {
	parser := obioptions.GenerateOptionParser(obiminion.OptionSet)

	_, args := parser(os.Args)

	sequences, err := obiconvert.CLIReadBioSequences(args...)
	if err != nil {
		log.Errorf("Cannot open file (%v)", err)
		os.Exit(1)
	}

	processed := obiminion.CLIOBIMinion(sequences)
	obiconvert.CLIWriteBioSequences(processed, true)

	// Block until every stage of the iterator pipeline has finished.
	obiiter.WaitForLastPipe()
}
|
@ -43,7 +43,7 @@ func main() {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
unique := obiuniq.Unique(sequences)
|
||||
unique := obiuniq.CLIUnique(sequences)
|
||||
obiconvert.CLIWriteBioSequences(unique, true)
|
||||
|
||||
obiiter.WaitForLastPipe()
|
||||
|
@ -58,182 +58,3 @@ var _empty = encodeValues(0, 0, false)
|
||||
var _out = encodeValues(0, 30000, true)
|
||||
var _notavail = encodeValues(0, 30000, false)
|
||||
|
||||
// func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
||||
|
||||
// lA := seqA.Len()
|
||||
// lB := seqB.Len()
|
||||
|
||||
// // Ensure that A is the longest
|
||||
// if lA < lB {
|
||||
// seqA, seqB = seqB, seqA
|
||||
// lA, lB = lB, lA
|
||||
// }
|
||||
|
||||
// if maxError == -1 {
|
||||
// maxError = lA * 2
|
||||
// }
|
||||
|
||||
// delta := lA - lB
|
||||
|
||||
// // The difference of length is larger the maximum allowed errors
|
||||
// if delta > maxError {
|
||||
// return -1, -1
|
||||
// }
|
||||
|
||||
// // Doit-on vraiment diviser par deux ??? pas certain
|
||||
// extra := (maxError - delta) + 1
|
||||
|
||||
// even := 1 + delta + 2*extra
|
||||
// width := 2*even - 1
|
||||
|
||||
// if buffer == nil {
|
||||
// var local []uint64
|
||||
// buffer = &local
|
||||
// }
|
||||
|
||||
// if cap(*buffer) < 2*width {
|
||||
// *buffer = make([]uint64, 3*width)
|
||||
// }
|
||||
|
||||
// previous := (*buffer)[0:width]
|
||||
// current := (*buffer)[width:(2 * width)]
|
||||
|
||||
// previous[extra] = _empty
|
||||
// previous[extra+even] = encodeValues(0, 1, false)
|
||||
// previous[extra+even-1] = encodeValues(0, 1, false)
|
||||
|
||||
// N := lB + ((delta) >> 1)
|
||||
|
||||
// bA := seqA.Sequence()
|
||||
// bB := seqB.Sequence()
|
||||
|
||||
// // log.Println("N = ", N)
|
||||
|
||||
// for y := 1; y <= N; y++ {
|
||||
// // in_matrix := false
|
||||
// x1 := y - lB + extra
|
||||
// x2 := extra - y
|
||||
// xs := obiutils.MaxInt(obiutils.MaxInt(x1, x2), 0)
|
||||
|
||||
// x1 = y + extra
|
||||
// x2 = lA + extra - y
|
||||
// xf := obiutils.MinInt(obiutils.MinInt(x1, x2), even-1) + 1
|
||||
|
||||
// for x := xs; x < xf; x++ {
|
||||
|
||||
// i := y - x + extra
|
||||
// j := y + x - extra
|
||||
|
||||
// var Sdiag, Sleft, Sup uint64
|
||||
|
||||
// switch {
|
||||
|
||||
// case i == 0:
|
||||
// Sup = _notavail
|
||||
// Sdiag = _notavail
|
||||
// Sleft = encodeValues(0, j-1, false)
|
||||
// case j == 0:
|
||||
// Sup = encodeValues(0, i-1, false)
|
||||
// Sdiag = _notavail
|
||||
// Sleft = _notavail
|
||||
// default:
|
||||
// Sdiag = previous[x]
|
||||
|
||||
// if bA[j-1] == bB[i-1] {
|
||||
// Sdiag = _incscore(Sdiag)
|
||||
// }
|
||||
|
||||
// if x < (even - 1) {
|
||||
// Sup = previous[x+even]
|
||||
// } else {
|
||||
// Sup = _out
|
||||
// }
|
||||
// if x > 0 {
|
||||
// Sleft = previous[x+even-1]
|
||||
// } else {
|
||||
// Sleft = _out
|
||||
// }
|
||||
// }
|
||||
|
||||
// var score uint64
|
||||
// switch {
|
||||
// case Sdiag >= Sup && Sdiag >= Sleft:
|
||||
// score = Sdiag
|
||||
// case Sup >= Sleft:
|
||||
// score = Sup
|
||||
// default:
|
||||
// score = Sleft
|
||||
// }
|
||||
|
||||
// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) {
|
||||
// score = _setout(score)
|
||||
// }
|
||||
|
||||
// current[x] = _incpath(score)
|
||||
// }
|
||||
// // . 9 10 + 2 - 1
|
||||
// x1 = y - lB + extra + even
|
||||
// x2 = extra - y + even - 1
|
||||
// xs = obiutils.MaxInt(obiutils.MaxInt(x1, x2), even)
|
||||
|
||||
// x1 = y + extra + even
|
||||
// x2 = lA + extra - y + even - 1
|
||||
// xf = obiutils.MinInt(obiutils.MinInt(x1, x2), width-1) + 1
|
||||
|
||||
// for x := xs; x < xf; x++ {
|
||||
|
||||
// i := y - x + extra + even
|
||||
// j := y + x - extra - even + 1
|
||||
|
||||
// var Sdiag, Sleft, Sup uint64
|
||||
|
||||
// switch {
|
||||
|
||||
// case i == 0:
|
||||
// Sup = _notavail
|
||||
// Sdiag = _notavail
|
||||
// Sleft = encodeValues(0, j-1, false)
|
||||
// case j == 0:
|
||||
// Sup = encodeValues(0, i-1, false)
|
||||
// Sdiag = _notavail
|
||||
// Sleft = _notavail
|
||||
// default:
|
||||
// Sdiag = previous[x]
|
||||
// if bA[j-1] == bB[i-1] {
|
||||
// Sdiag = _incscore(Sdiag)
|
||||
// }
|
||||
|
||||
// Sleft = current[x-even]
|
||||
// Sup = current[x-even+1]
|
||||
|
||||
// }
|
||||
|
||||
// var score uint64
|
||||
// switch {
|
||||
// case Sdiag >= Sup && Sdiag >= Sleft:
|
||||
// score = Sdiag
|
||||
// case Sup >= Sleft:
|
||||
// score = Sup
|
||||
// default:
|
||||
// score = Sleft
|
||||
// }
|
||||
|
||||
// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) {
|
||||
// score = _setout(score)
|
||||
// }
|
||||
|
||||
// current[x] = _incpath(score)
|
||||
// }
|
||||
|
||||
// previous, current = current, previous
|
||||
|
||||
// }
|
||||
|
||||
// s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)])
|
||||
|
||||
// if o {
|
||||
// return -1, -1
|
||||
// }
|
||||
|
||||
// return s, l
|
||||
// }
|
||||
|
@ -130,11 +130,11 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
|
||||
// in_matrix := false
|
||||
x1 := y - lB + extra
|
||||
x2 := extra - y
|
||||
xs := obiutils.MaxInt(obiutils.MaxInt(x1, x2), 0)
|
||||
xs := obiutils.Max(obiutils.Max(x1, x2), 0)
|
||||
|
||||
x1 = y + extra
|
||||
x2 = lA + extra - y
|
||||
xf := obiutils.MinInt(obiutils.MinInt(x1, x2), even-1) + 1
|
||||
xf := obiutils.Min(obiutils.Min(x1, x2), even-1) + 1
|
||||
|
||||
for x := xs; x < xf; x++ {
|
||||
|
||||
@ -222,11 +222,11 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
|
||||
// . 9 10 + 2 - 1
|
||||
x1 = y - lB + extra + even
|
||||
x2 = extra - y + even - 1
|
||||
xs = obiutils.MaxInt(obiutils.MaxInt(x1, x2), even)
|
||||
xs = obiutils.Max(obiutils.Max(x1, x2), even)
|
||||
|
||||
x1 = y + extra + even
|
||||
x2 = lA + extra - y + even - 1
|
||||
xf = obiutils.MinInt(obiutils.MinInt(x1, x2), width-1) + 1
|
||||
xf = obiutils.Min(obiutils.Min(x1, x2), width-1) + 1
|
||||
|
||||
for x := xs; x < xf; x++ {
|
||||
|
||||
@ -348,16 +348,15 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
|
||||
// - Matching: 1
|
||||
// - Mismatch or gap: 0
|
||||
//
|
||||
// Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps.
|
||||
// Parameters:
|
||||
// - seqA: The first bio sequence.
|
||||
// - seqB: The second bio sequence.
|
||||
// - maxError: The maximum allowed error between the sequences. If set to -1, no limit is applied.
|
||||
// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
|
||||
//
|
||||
// if buffer != nil, the buffer is used to store intermediate results.
|
||||
// Otherwise, a new buffer is allocated.
|
||||
//
|
||||
// seqA: The first bio sequence.
|
||||
// seqB: The second bio sequence.
|
||||
// maxError: The maximum allowed error between the sequences.
|
||||
// buffer: A buffer to store intermediate results.
|
||||
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
|
||||
// Returns:
|
||||
// - The score of the longest common subsequence.
|
||||
// - The length of the shortest alignment corresponding to the LCS.
|
||||
func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
||||
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
|
||||
}
|
||||
@ -372,14 +371,16 @@ func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uin
|
||||
// - Matching: 1
|
||||
// - Mismatch or gap: 0
|
||||
//
|
||||
// if buffer != nil, the buffer is used to store intermediate results.
|
||||
// Otherwise, a new buffer is allocated.
|
||||
// Parameters:
|
||||
// - seqA: The first bio sequence.
|
||||
// - seqB: The second bio sequence.
|
||||
// - maxError: The maximum allowed error between the sequences. If set to -1, no limit is applied.
|
||||
// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
|
||||
//
|
||||
// seqA: The first bio sequence.
|
||||
// seqB: The second bio sequence.
|
||||
// maxError: The maximum allowed error between the sequences.
|
||||
// buffer: A buffer to store intermediate results.
|
||||
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
|
||||
// Returns:
|
||||
// - The score of the longest common subsequence.
|
||||
// - The length of the shortest alignment corresponding to the LCS.
|
||||
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
||||
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
|
||||
}
|
||||
|
||||
|
@ -348,8 +348,8 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (
|
||||
|
||||
start = best[0] - nerr
|
||||
end = best[0] + int(pattern.pointer.pointer.patlen) + nerr
|
||||
start = obiutils.MaxInt(start, 0)
|
||||
end = obiutils.MinInt(end, sequence.Len())
|
||||
start = obiutils.Max(start, 0)
|
||||
end = obiutils.Min(end, sequence.Len())
|
||||
|
||||
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
|
||||
frg := sequence.pointer.reference.Sequence()[start:end]
|
||||
@ -387,8 +387,8 @@ func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int)
|
||||
if m[2] > 0 && pattern.pointer.pointer.hasIndel {
|
||||
start := m[0] - m[2]
|
||||
end := m[0] + int(pattern.pointer.pointer.patlen) + m[2]
|
||||
start = obiutils.MaxInt(start, 0)
|
||||
end = obiutils.MinInt(end, sequence.Len())
|
||||
start = obiutils.Max(start, 0)
|
||||
end = obiutils.Min(end, sequence.Len())
|
||||
|
||||
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
|
||||
frg := sequence.pointer.reference.Sequence()[start:end]
|
||||
|
327
pkg/obigraph/graph.go
Normal file
327
pkg/obigraph/graph.go
Normal file
@ -0,0 +1,327 @@
|
||||
package obigraph
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"os"
|
||||
"text/template"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Edge describes a single edge between two vertex indices, optionally
// carrying a payload of type T.
type Edge[T any] struct {
	From int
	To   int
	Data *T
}

// Edges maps a source vertex index to the set of its target vertex
// indices, each associated with an optional edge payload.
type Edges[T any] map[int]map[int]*T

// Graph is a generic graph whose vertices are stored in a slice of V and
// whose edges may carry data of type T. Vertex/edge weight and vertex
// identifier accessors are exposed as replaceable callbacks.
type Graph[V any, T any] struct {
	Name         string
	Vertices     *[]V
	Edges        *Edges[T]
	ReverseEdges *Edges[T]
	VertexWeight func(int) float64
	VertexId     func(int) string
	EdgeWeight   func(int, int) float64
}

// NewEdges returns a pointer to a freshly allocated, empty edge set.
func NewEdges[T any]() *Edges[T] {
	edges := Edges[T]{}
	return &edges
}
|
||||
|
||||
// AddEdge adds an edge to the graph between two vertices.
|
||||
//
|
||||
// Parameters:
|
||||
// - from: the index of the starting vertex.
|
||||
// - to: the index of the ending vertex.
|
||||
// - data: a pointer to the data associated with the edge.
|
||||
func (e *Edges[T]) AddEdge(from, to int, data *T) {
|
||||
|
||||
fnode, ok := (*e)[from]
|
||||
if !ok {
|
||||
fnode = make(map[int]*T)
|
||||
(*e)[from] = fnode
|
||||
}
|
||||
fnode[to] = data
|
||||
}
|
||||
|
||||
// NewGraph creates a new graph with the specified name and vertices.
|
||||
//
|
||||
// Parameters:
|
||||
// - name: a string representing the name of the graph.
|
||||
// - vertices: a slice of vertices of type V.
|
||||
//
|
||||
// Returns:
|
||||
// - Graph[V, T]: the newly created graph.
|
||||
func NewGraph[V, T any](name string, vertices *[]V) *Graph[V, T] {
|
||||
return &Graph[V, T]{
|
||||
Name: name,
|
||||
Vertices: vertices,
|
||||
Edges: NewEdges[T](),
|
||||
ReverseEdges: NewEdges[T](),
|
||||
VertexWeight: func(i int) float64 {
|
||||
return 1.0
|
||||
},
|
||||
EdgeWeight: func(i, j int) float64 {
|
||||
return 1.0
|
||||
},
|
||||
VertexId: func(i int) string {
|
||||
return fmt.Sprintf("V%d", i)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// AddEdge adds an edge between two vertices in the graph.
|
||||
//
|
||||
// Parameters:
|
||||
// - from: the index of the starting vertex.
|
||||
// - to: the index of the ending vertex.
|
||||
// - data: a pointer to the data associated with the edge.
|
||||
func (g *Graph[V, T]) AddEdge(from, to int, data *T) {
|
||||
lv := len(*g.Vertices)
|
||||
if from >= lv || to >= lv {
|
||||
log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1)
|
||||
}
|
||||
|
||||
g.Edges.AddEdge(from, to, data)
|
||||
g.Edges.AddEdge(to, from, data)
|
||||
g.ReverseEdges.AddEdge(to, from, data)
|
||||
g.ReverseEdges.AddEdge(from, to, data)
|
||||
}
|
||||
|
||||
// AddDirectedEdge adds a directed edge from one vertex to another in the graph.
|
||||
//
|
||||
// Parameters:
|
||||
// - from: an integer representing the index of the starting vertex.
|
||||
// - to: an integer representing the index of the ending vertex.
|
||||
// - data: a pointer to the data associated with the edge.
|
||||
func (g *Graph[V, T]) AddDirectedEdge(from, to int, data *T) {
|
||||
lv := len(*g.Vertices)
|
||||
|
||||
if from >= lv || to >= lv {
|
||||
log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1)
|
||||
}
|
||||
|
||||
g.Edges.AddEdge(from, to, data)
|
||||
g.ReverseEdges.AddEdge(to, from, data)
|
||||
}
|
||||
|
||||
// SetAsDirectedEdge sets the edge from one vertex to another as directed in the graph.
|
||||
//
|
||||
// Parameters:
|
||||
// - from: an integer representing the index of the starting vertex.
|
||||
// - to: an integer representing the index of the ending vertex.
|
||||
func (g *Graph[V, T]) SetAsDirectedEdge(from, to int) {
|
||||
lv := len(*g.Vertices)
|
||||
|
||||
if from >= lv || to >= lv {
|
||||
log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1)
|
||||
}
|
||||
|
||||
if _, ok := (*g.Edges)[from][to]; ok {
|
||||
if _, ok := (*g.Edges)[to][from]; ok {
|
||||
delete((*g.Edges)[to], from)
|
||||
delete((*g.Edges)[from], to)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
log.Error("no edge from ", from, " to ", to)
|
||||
|
||||
}
|
||||
|
||||
// Neighbors generates a list of neighbor vertices for a given vertex index in the graph.
|
||||
//
|
||||
// Parameters:
|
||||
// - v: an integer representing the index of the vertex.
|
||||
// Returns:
|
||||
// - []int: a list of neighbor vertices.
|
||||
func (g *Graph[V, T]) Neighbors(v int) []int {
|
||||
if neighbors, ok := (*g.Edges)[v]; ok {
|
||||
rep := make([]int, 0, len(neighbors))
|
||||
|
||||
for k := range neighbors {
|
||||
rep = append(rep, k)
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
}
|
||||
|
||||
// Degree calculates the degree of a vertex in a graph.
|
||||
//
|
||||
// Parameters:
|
||||
// - v: an integer representing the index of the vertex.
|
||||
//
|
||||
// Returns:
|
||||
// - an integer representing the degree of the vertex.
|
||||
func (g *Graph[V, T]) Degree(v int) int {
|
||||
if neighbors, ok := (*g.Edges)[v]; ok {
|
||||
return len(neighbors)
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// Parents returns a list of parent vertices for a given vertex index in the graph.
|
||||
//
|
||||
// Parameters:
|
||||
// - v: an integer representing the index of the vertex.
|
||||
//
|
||||
// Returns:
|
||||
// - []int: a list of parent vertices.
|
||||
func (g *Graph[V, T]) Parents(v int) []int {
|
||||
if parents, ok := (*g.ReverseEdges)[v]; ok {
|
||||
rep := make([]int, 0, len(parents))
|
||||
|
||||
for k := range parents {
|
||||
rep = append(rep, k)
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ParentDegree calculates the degree of a vertex in a graph by counting the number of its parent vertices.
|
||||
//
|
||||
// Parameters:
|
||||
// - v: an integer representing the index of the vertex.
|
||||
//
|
||||
// Returns:
|
||||
// - an integer representing the degree of the vertex.
|
||||
func (g *Graph[V, T]) ParentDegree(v int) int {
|
||||
if parents, ok := (*g.ReverseEdges)[v]; ok {
|
||||
return len(parents)
|
||||
}
|
||||
return 0
|
||||
|
||||
}
|
||||
|
||||
// gml_graph bundles a Graph with the rendering options consumed by the
// GML template in Graph.Gml; its fields are referenced by name from
// within that template.
type gml_graph[V any, T any] struct {
	Graph       *Graph[V, T] // graph to render
	As_directed bool         // render as a directed graph
	Min_degree  int          // minimum vertex degree to be rendered
	Threshold   float64      // vertex-weight threshold used to pick the node shape
	Scale       int          // scaling factor applied to node sizes
}
|
||||
|
||||
// Gml generates a GML representation of the graph.
|
||||
//
|
||||
// as_directed: whether the graph should be treated as directed or undirected.
|
||||
// threshold: the threshold value.
|
||||
// scale: the scaling factor.
|
||||
// string: the GML representation of the graph.
|
||||
func (g *Graph[V, T]) Gml(as_directed bool, min_degree int, threshold float64, scale int) string {
|
||||
// (*seqs)[1].Count
|
||||
var gml bytes.Buffer
|
||||
|
||||
data := gml_graph[V, T]{
|
||||
Graph: g,
|
||||
As_directed: as_directed,
|
||||
Min_degree: min_degree,
|
||||
Threshold: threshold,
|
||||
Scale: scale,
|
||||
}
|
||||
|
||||
digraphTpl := template.New("gml_digraph")
|
||||
|
||||
digraph := ` {{$context := .}}
|
||||
graph [
|
||||
comment "{{ if $context.As_directed }}Directed graph{{ else }}Undirected graph{{ end }} {{ Name }}"
|
||||
directed {{ if $context.As_directed }}1{{ else }}0{{ end }}
|
||||
|
||||
{{range $index, $data:= $context.Graph.Vertices}}
|
||||
{{ if (ge (Degree $index) $context.Min_degree)}}
|
||||
node [
|
||||
id {{$index}}
|
||||
graphics [
|
||||
type "{{ Shape $index }}"
|
||||
h {{ Sqrt (VertexWeight $index) }}
|
||||
w {{ Sqrt (VertexWeight $index) }}
|
||||
]
|
||||
]
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
|
||||
{{range $source, $data:= $context.Graph.Edges}}
|
||||
{{range $target, $edge:= $data}}
|
||||
{{ if and (ge $source $context.Min_degree) (ge $target $context.Min_degree) (or $context.As_directed (lt $source $target))}}
|
||||
edge [ source {{$source}}
|
||||
target {{$target}}
|
||||
color "#00FF00"
|
||||
]
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
|
||||
]
|
||||
`
|
||||
|
||||
tmpl, err := digraphTpl.Funcs(template.FuncMap{
|
||||
"Sqrt": func(i float64) int { return scale * int(math.Floor(math.Sqrt(i))) },
|
||||
"Name": func() string { return g.Name },
|
||||
"VertexId": func(i int) string { return g.VertexId(i) },
|
||||
"Degree": func(i int) int { return g.Degree(i) },
|
||||
"VertexWeight": func(i int) float64 { return g.VertexWeight(i) },
|
||||
"Shape": func(i int) string {
|
||||
if g.VertexWeight(i) >= threshold {
|
||||
return "circle"
|
||||
} else {
|
||||
return "rectangle"
|
||||
}
|
||||
},
|
||||
}).Parse(digraph)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = tmpl.Execute(&gml, data)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
return gml.String()
|
||||
|
||||
}
|
||||
|
||||
// WriteGml writes the GML representation of the graph to an io.Writer.
|
||||
//
|
||||
// w: the io.Writer to write the GML representation to.
|
||||
// as_directed: whether the graph should be treated as directed or undirected.
|
||||
// threshold: the threshold value.
|
||||
// scale: the scaling factor.
|
||||
func (g *Graph[V, T]) WriteGml(w io.Writer, as_directed bool, min_degree int, threshold float64, scale int) {
|
||||
|
||||
_, err := w.Write([]byte(g.Gml(as_directed, min_degree, threshold, scale)))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// WriteGmlFile writes the graph in GML format to the specified file.
|
||||
//
|
||||
// filename: the name of the file to write the GML representation to.
|
||||
// as_directed: whether the graph should be treated as directed or undirected.
|
||||
// threshold: the threshold value.
|
||||
// scale: the scaling factor.
|
||||
func (g *Graph[V, T]) WriteGmlFile(filename string, as_directed bool, min_degree int, threshold float64, scale int) {
|
||||
|
||||
f, err := os.Create(filename)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer f.Close()
|
||||
g.WriteGml(f, as_directed, min_degree, threshold, scale)
|
||||
}
|
104
pkg/obigraph/graphbuffer.go
Normal file
104
pkg/obigraph/graphbuffer.go
Normal file
@ -0,0 +1,104 @@
|
||||
package obigraph
|
||||
|
||||
import (
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
||||
type GraphBuffer[V, T any] struct {
|
||||
Graph *Graph[V, T]
|
||||
Channel chan Edge[T]
|
||||
}
|
||||
|
||||
// NewGraphBuffer creates a new GraphBuffer with the given name and vertices.
|
||||
//
|
||||
// Parameters:
|
||||
// - name: the name of the GraphBuffer.
|
||||
// - vertices: a slice of vertices to initialize the GraphBuffer.
|
||||
//
|
||||
// Returns:
|
||||
// - GraphBuffer[V, T]: the newly created GraphBuffer.
|
||||
func NewGraphBuffer[V, T any](name string, vertices *[]V) *GraphBuffer[V, T] {
|
||||
buffer := GraphBuffer[V, T]{
|
||||
Graph: NewGraph[V, T](name, vertices),
|
||||
Channel: make(chan Edge[T]),
|
||||
}
|
||||
|
||||
go func() {
|
||||
for edge := range buffer.Channel {
|
||||
buffer.Graph.AddEdge(edge.From, edge.To, edge.Data)
|
||||
}
|
||||
}()
|
||||
|
||||
return &buffer
|
||||
}
|
||||
|
||||
// AddEdge adds an edge to the GraphBuffer.
|
||||
//
|
||||
// Parameters:
|
||||
// - from: the index of the starting vertex.
|
||||
// - to: the index of the ending vertex.
|
||||
// - data: a pointer to the data associated with the edge.
|
||||
func (g *GraphBuffer[V, T]) AddEdge(from, to int, data *T) {
|
||||
g.Channel <- Edge[T]{
|
||||
From: from,
|
||||
To: to,
|
||||
Data: data,
|
||||
}
|
||||
}
|
||||
|
||||
// AddDirectedEdge adds a directed edge from one vertex to another in the GraphBuffer.
|
||||
//
|
||||
// Parameters:
|
||||
// - from: the index of the starting vertex.
|
||||
// - to: the index of the ending vertex.
|
||||
// - data: a pointer to the data associated with the edge.
|
||||
func (g *GraphBuffer[V, T]) AddDirectedEdge(from, to int, data *T) {
|
||||
g.Channel <- Edge[T]{
|
||||
From: from,
|
||||
To: to,
|
||||
Data: data,
|
||||
}
|
||||
}
|
||||
|
||||
// Gml generates a GML representation of the buffered graph by
// delegating to the underlying Graph's Gml method.
//
// as_directed: whether the graph should be treated as directed or undirected.
// min_degree: the minimum degree of vertices to include in the GML representation.
// threshold: the threshold value.
// scale: the scaling factor.
// string: the GML representation of the graph.
func (g *GraphBuffer[V, T]) Gml(as_directed bool, min_degree int, threshold float64, scale int) string {
	return g.Graph.Gml(as_directed, min_degree, threshold, scale)
}
|
||||
|
||||
func (g *GraphBuffer[V, T]) WriteGmlFile(filename string, as_directed bool, min_degree int, threshold float64, scale int) {
|
||||
|
||||
f, err := os.Create(filename)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer f.Close()
|
||||
g.WriteGml(f, as_directed, min_degree, threshold, scale)
|
||||
}
|
||||
|
||||
// WriteGml writes the GML representation of the graph to an io.Writer.
|
||||
//
|
||||
// w: the io.Writer to write the GML representation to.
|
||||
// as_directed: whether the graph should be treated as directed or undirected.
|
||||
// min_degree: the minimum degree of vertices to include in the GML representation.
|
||||
// threshold: the threshold value.
|
||||
// scale: the scaling factor.
|
||||
func (g *GraphBuffer[V, T]) WriteGml(w io.Writer, as_directed bool, min_degree int, threshold float64, scale int) {
|
||||
_, err := w.Write([]byte(g.Gml(as_directed, min_degree, threshold, scale)))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Close closes the GraphBuffer by closing its channel.
//
// Closing the channel lets the background consumer goroutine started by
// NewGraphBuffer terminate after processing any pending edges. The
// buffer must not be used to add edges after Close.
//
// No parameters.
func (g *GraphBuffer[V, T]) Close() {
	close(g.Channel)
}
|
@ -30,7 +30,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
|
||||
news = append(news, s)
|
||||
} else {
|
||||
for i := 0; i < s.Len(); i += step {
|
||||
end := obiutils.MinInt(i+length, s.Len())
|
||||
end := obiutils.Min(i+length, s.Len())
|
||||
fusion := false
|
||||
if (s.Len() - end) < step {
|
||||
end = s.Len()
|
||||
|
@ -9,9 +9,10 @@ import (
|
||||
|
||||
type Table4mer [256]uint16
|
||||
|
||||
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
|
||||
|
||||
iternal_buffer := Encode4mer(seq, buffer)
|
||||
|
||||
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
|
||||
iternal_buffer := Encode4mer(seq, buffer) // The slice of 4-mer codes
|
||||
|
||||
if counts == nil {
|
||||
var w Table4mer
|
||||
@ -19,7 +20,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl
|
||||
}
|
||||
|
||||
// Every cells of the counter is set to zero
|
||||
for i := 0; i < 256; i++ {
|
||||
for i := 0; i < 256; i++ { // 256 is the number of possible 4-mer codes
|
||||
(*counts)[i] = 0
|
||||
}
|
||||
|
||||
@ -32,7 +33,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl
|
||||
func Common4Mer(count1, count2 *Table4mer) int {
|
||||
sum := 0
|
||||
for i := 0; i < 256; i++ {
|
||||
sum += int(obiutils.MinUInt16((*count1)[i], (*count2)[i]))
|
||||
sum += int(obiutils.Min((*count1)[i], (*count2)[i]))
|
||||
}
|
||||
return sum
|
||||
}
|
||||
@ -48,7 +49,7 @@ func Sum4Mer(count *Table4mer) int {
|
||||
func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
|
||||
s1 := Sum4Mer(count1)
|
||||
s2 := Sum4Mer(count2)
|
||||
smin := obiutils.MinInt(s1, s2)
|
||||
smin := obiutils.Min(s1, s2)
|
||||
|
||||
cw := Common4Mer(count1, count2)
|
||||
|
||||
@ -65,7 +66,7 @@ func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
|
||||
func Error4MerBounds(count1, count2 *Table4mer) (int, int) {
|
||||
s1 := Sum4Mer(count1)
|
||||
s2 := Sum4Mer(count2)
|
||||
smax := obiutils.MaxInt(s1, s2)
|
||||
smax := obiutils.Max(s1, s2)
|
||||
|
||||
cw := Common4Mer(count1, count2)
|
||||
|
||||
|
@ -2,13 +2,16 @@ package obikmer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"container/heap"
|
||||
"fmt"
|
||||
"math"
|
||||
"math/bits"
|
||||
"slices"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"github.com/daichi-m/go18ds/sets/linkedhashset"
|
||||
"github.com/daichi-m/go18ds/stacks/arraystack"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
type KmerIdx32 uint32
|
||||
@ -49,36 +52,60 @@ type KmerIdx_t interface {
|
||||
}
|
||||
|
||||
type DeBruijnGraph struct {
|
||||
kmersize int
|
||||
kmermask uint64
|
||||
prevc uint64
|
||||
kmersize int // k-mer size
|
||||
kmermask uint64 // mask used to set to 0 the bits that are not in the k-mer
|
||||
prevc uint64 //
|
||||
prevg uint64
|
||||
prevt uint64
|
||||
graph map[uint64]uint
|
||||
graph map[uint64]uint // Kmer are encoded as uint64 with 2 bits per character
|
||||
}
|
||||
|
||||
// MakeDeBruijnGraph creates a De Bruijn Graph with the specified k-mer size.
|
||||
//
|
||||
// Parameters:
|
||||
//
|
||||
// kmersize int - the size of the k-mers
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// *DeBruijnGraph - a pointer to the created De Bruijn's Graph
|
||||
func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph {
|
||||
g := DeBruijnGraph{
|
||||
kmersize: kmersize,
|
||||
kmermask: ^(^uint64(0) << (uint64(kmersize+1) * 2)),
|
||||
prevc: uint64(1) << (uint64(kmersize) * 2),
|
||||
prevg: uint64(2) << (uint64(kmersize) * 2),
|
||||
prevt: uint64(3) << (uint64(kmersize) * 2),
|
||||
kmermask: ^(^uint64(0) << (uint64(kmersize) * 2)), // k-mer mask used to set to 0 the bits that are not in the k-mer
|
||||
prevc: uint64(1) << (uint64(kmersize-1) * 2),
|
||||
prevg: uint64(2) << (uint64(kmersize-1) * 2),
|
||||
prevt: uint64(3) << (uint64(kmersize-1) * 2),
|
||||
graph: make(map[uint64]uint),
|
||||
}
|
||||
|
||||
return &g
|
||||
}
|
||||
|
||||
// KmerSize returns the size of the k-mers in the DeBruijn graph.
|
||||
//
|
||||
// This function takes no parameters.
|
||||
// It returns an integer representing the size of the k-mers.
|
||||
func (g *DeBruijnGraph) KmerSize() int {
|
||||
return g.kmersize
|
||||
}
|
||||
|
||||
// Len returns the length of the graph.
|
||||
//
|
||||
// This function takes no parameters.
|
||||
// It returns an integer representing the number of nodes in the graph.
|
||||
func (g *DeBruijnGraph) Len() int {
|
||||
return len(g.graph)
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxLink() int {
|
||||
// MaxWeight returns the maximum weight of a node from the DeBruijn's Graph.
|
||||
//
|
||||
// It iterates over each count in the graph map and updates the max value if the current count is greater.
|
||||
// Finally, it returns the maximum weight as an integer.
|
||||
//
|
||||
// Returns:
|
||||
// - int: the maximum weight value.
|
||||
func (g *DeBruijnGraph) MaxWeight() int {
|
||||
max := uint(0)
|
||||
for _, count := range g.graph {
|
||||
if count > max {
|
||||
@ -89,8 +116,12 @@ func (g *DeBruijnGraph) MaxLink() int {
|
||||
return int(max)
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) LinkSpectrum() []int {
|
||||
max := g.MaxLink()
|
||||
// WeightSpectrum calculates the weight spectrum of nodes in the DeBruijn's graph.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an array of integers representing the weight spectrum.
|
||||
func (g *DeBruijnGraph) WeightSpectrum() []int {
|
||||
max := g.MaxWeight()
|
||||
spectrum := make([]int, max+1)
|
||||
for _, count := range g.graph {
|
||||
spectrum[int(count)]++
|
||||
@ -99,7 +130,10 @@ func (g *DeBruijnGraph) LinkSpectrum() []int {
|
||||
return spectrum
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) FilterMin(min int) {
|
||||
// FilterMinWeight filters the DeBruijnGraph by removing nodes with weight less than the specified minimum.
|
||||
//
|
||||
// min: an integer representing the minimum count threshold.
|
||||
func (g *DeBruijnGraph) FilterMinWeight(min int) {
|
||||
umin := uint(min)
|
||||
for idx, count := range g.graph {
|
||||
if count < umin {
|
||||
@ -109,8 +143,12 @@ func (g *DeBruijnGraph) FilterMin(min int) {
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
|
||||
if _, ok := g.graph[index]; !ok {
|
||||
log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index)
|
||||
}
|
||||
|
||||
rep := make([]uint64, 0, 4)
|
||||
index = index >> 2
|
||||
index >>= 2
|
||||
|
||||
if _, ok := g.graph[index]; ok {
|
||||
rep = append(rep, index)
|
||||
@ -135,6 +173,10 @@ func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
|
||||
if _, ok := g.graph[index]; !ok {
|
||||
log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index)
|
||||
}
|
||||
|
||||
rep := make([]uint64, 0, 4)
|
||||
index = (index << 2) & g.kmermask
|
||||
|
||||
@ -160,11 +202,11 @@ func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
|
||||
return rep
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
|
||||
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, int, bool) {
|
||||
ns := g.Nexts(index)
|
||||
|
||||
if len(ns) == 0 {
|
||||
return uint64(0), false
|
||||
return uint64(0), 0, false
|
||||
}
|
||||
|
||||
max := uint(0)
|
||||
@ -177,7 +219,34 @@ func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
|
||||
}
|
||||
}
|
||||
|
||||
return rep, true
|
||||
return rep, int(max), true
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Heads() []uint64 {
|
||||
rep := make([]uint64, 0, 10)
|
||||
|
||||
for k := range g.graph {
|
||||
if len(g.Previouses(k)) == 0 {
|
||||
rep = append(rep, k)
|
||||
}
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxHead() (uint64, int, bool) {
|
||||
rep := uint64(0)
|
||||
max := uint(0)
|
||||
found := false
|
||||
for k, w := range g.graph {
|
||||
if len(g.Previouses(k)) == 0 && w > max {
|
||||
rep = k
|
||||
max = w
|
||||
found = true
|
||||
}
|
||||
}
|
||||
|
||||
return rep, int(max), found
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxPath() []uint64 {
|
||||
@ -185,17 +254,17 @@ func (g *DeBruijnGraph) MaxPath() []uint64 {
|
||||
ok := false
|
||||
idx := uint64(0)
|
||||
|
||||
idx, ok = g.MaxHead()
|
||||
idx, _, ok = g.MaxHead()
|
||||
|
||||
for ok {
|
||||
path = append(path, idx)
|
||||
idx, ok = g.MaxNext(idx)
|
||||
idx, _, ok = g.MaxNext(idx)
|
||||
}
|
||||
|
||||
return path
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) LongestPath() []uint64 {
|
||||
func (g *DeBruijnGraph) LongestPath(max_length int) []uint64 {
|
||||
var path []uint64
|
||||
wmax := uint(0)
|
||||
ok := true
|
||||
@ -209,7 +278,11 @@ func (g *DeBruijnGraph) LongestPath() []uint64 {
|
||||
nw := g.graph[idx]
|
||||
w += nw
|
||||
lp = append(lp, idx)
|
||||
idx, ok = g.MaxNext(idx)
|
||||
idx, _, ok = g.MaxNext(idx)
|
||||
if max_length > 0 && len(lp) > max_length {
|
||||
ok = false
|
||||
w = 0
|
||||
}
|
||||
}
|
||||
|
||||
if w > wmax {
|
||||
@ -221,8 +294,9 @@ func (g *DeBruijnGraph) LongestPath() []uint64 {
|
||||
return path
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) {
|
||||
path := g.LongestPath()
|
||||
func (g *DeBruijnGraph) LongestConsensus(id string, max_length int) (*obiseq.BioSequence, error) {
|
||||
//path := g.LongestPath(max_length)
|
||||
path := g.HaviestPath()
|
||||
s := g.DecodePath(path)
|
||||
|
||||
if len(s) > 0 {
|
||||
@ -238,37 +312,10 @@ func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error)
|
||||
return nil, fmt.Errorf("cannot identify optimum path")
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Heads() []uint64 {
|
||||
rep := make([]uint64, 0, 10)
|
||||
|
||||
for k := range g.graph {
|
||||
if len(g.Previouses(k)) == 0 {
|
||||
rep = append(rep, k)
|
||||
}
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxHead() (uint64, bool) {
|
||||
rep := uint64(0)
|
||||
max := uint(0)
|
||||
found := false
|
||||
for k, w := range g.graph {
|
||||
if len(g.Previouses(k)) == 0 && w > max {
|
||||
rep = k
|
||||
found = true
|
||||
}
|
||||
}
|
||||
|
||||
return rep, found
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) DecodeNode(index uint64) string {
|
||||
rep := make([]byte, g.kmersize)
|
||||
index >>= 2
|
||||
for i := g.kmersize - 1; i >= 0; i-- {
|
||||
rep[i], _ = decode[index&3]
|
||||
rep[i] = decode[index&3]
|
||||
index >>= 2
|
||||
}
|
||||
|
||||
@ -282,7 +329,7 @@ func (g *DeBruijnGraph) DecodePath(path []uint64) string {
|
||||
if len(path) > 0 {
|
||||
buf.WriteString(g.DecodeNode(path[0]))
|
||||
|
||||
for _, idx := range path {
|
||||
for _, idx := range path[1:] {
|
||||
buf.WriteByte(decode[idx&3])
|
||||
}
|
||||
}
|
||||
@ -307,6 +354,13 @@ func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence, error) {
|
||||
return nil, fmt.Errorf("cannot identify optimum path")
|
||||
}
|
||||
|
||||
// Weight returns the weight of the node at the given index in the DeBruijnGraph.
|
||||
//
|
||||
// Parameters:
|
||||
// - index: the index of the node in the graph.
|
||||
//
|
||||
// Returns:
|
||||
// - int: the weight of the node.
|
||||
func (g *DeBruijnGraph) Weight(index uint64) int {
|
||||
val, ok := g.graph[index]
|
||||
if !ok {
|
||||
@ -315,59 +369,64 @@ func (g *DeBruijnGraph) Weight(index uint64) int {
|
||||
return int(val)
|
||||
}
|
||||
|
||||
// append appends a sequence of nucleotides to the DeBruijnGraph.
|
||||
//
|
||||
// Parameters:
|
||||
// - sequence: a byte slice representing the sequence of nucleotides to append.
|
||||
// - current: the current node in the graph to which the sequence will be appended.
|
||||
// - weight: the weight of the added nodes.
|
||||
func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int) {
|
||||
|
||||
for i := 0; i < len(sequence); i++ {
|
||||
current <<= 2
|
||||
current &= graph.kmermask
|
||||
b := iupac[sequence[i]]
|
||||
if len(b) == 1 {
|
||||
current |= b[0]
|
||||
graph.graph[current] = uint(graph.Weight(current) + weight)
|
||||
} else {
|
||||
for j := 0; j < len(b); j++ {
|
||||
current &= ^uint64(3)
|
||||
current |= b[j]
|
||||
if len(sequence) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
graph.graph[current] = uint(graph.Weight(current) + weight)
|
||||
graph.append(sequence[(i+1):], current, weight)
|
||||
}
|
||||
return
|
||||
}
|
||||
current <<= 2
|
||||
current &= graph.kmermask
|
||||
b := iupac[sequence[0]]
|
||||
current |= b[0]
|
||||
graph.graph[current] = uint(graph.Weight(current) + weight)
|
||||
graph.append(sequence[1:], current, weight)
|
||||
|
||||
for j := 1; j < len(b); j++ {
|
||||
current &= ^uint64(3)
|
||||
current |= b[j]
|
||||
graph.graph[current] = uint(graph.Weight(current) + weight)
|
||||
graph.append(sequence[1:], current, weight)
|
||||
}
|
||||
}
|
||||
|
||||
func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
|
||||
key := uint64(0)
|
||||
s := sequence.Sequence()
|
||||
w := sequence.Count()
|
||||
init := make([]uint64, 0, 16)
|
||||
var f func(start int, key uint64)
|
||||
f = func(start int, key uint64) {
|
||||
for i := start; i < graph.kmersize; i++ {
|
||||
key <<= 2
|
||||
b := iupac[s[i]]
|
||||
if len(b) == 1 {
|
||||
key |= b[0]
|
||||
} else {
|
||||
for j := 0; j < len(b); j++ {
|
||||
key &= ^uint64(3)
|
||||
key |= b[j]
|
||||
f(i+1, key)
|
||||
}
|
||||
return
|
||||
}
|
||||
s := sequence.Sequence() // Get the sequence as a byte slice
|
||||
w := sequence.Count() // Get the weight of the sequence
|
||||
|
||||
var initFirstKmer func(start int, key uint64)
|
||||
|
||||
// Initialize the first k-mer
|
||||
// start is the index of the nucleotide in the k-mer to add
|
||||
// key is the value of the k-mer index before adding the start nucleotide
|
||||
initFirstKmer = func(start int, key uint64) {
|
||||
if start == 0 {
|
||||
key = 0
|
||||
}
|
||||
|
||||
if start < graph.kmersize {
|
||||
key <<= 2
|
||||
b := iupac[s[start]]
|
||||
|
||||
for _, code := range b {
|
||||
key &= ^uint64(3)
|
||||
key |= code
|
||||
initFirstKmer(start+1, key)
|
||||
}
|
||||
} else {
|
||||
graph.graph[key] = uint(graph.Weight(key) + w)
|
||||
graph.append(s[graph.kmersize:], key, w)
|
||||
}
|
||||
init = append(init, key&graph.kmermask)
|
||||
}
|
||||
|
||||
if sequence.Len() > graph.kmersize {
|
||||
f(0, key)
|
||||
|
||||
for _, idx := range init {
|
||||
graph.append(s[graph.kmersize:], idx, w)
|
||||
}
|
||||
initFirstKmer(0, 0)
|
||||
}
|
||||
}
|
||||
|
||||
@ -381,43 +440,51 @@ func (graph *DeBruijnGraph) Gml() string {
|
||||
|
||||
`)
|
||||
|
||||
nodeidx := make(map[uint64]int)
|
||||
nodeid := 0
|
||||
|
||||
for idx := range graph.graph {
|
||||
node := graph.DecodeNode(idx)
|
||||
buffer.WriteString(
|
||||
fmt.Sprintf("node [ id \"%s\" ]\n", node),
|
||||
)
|
||||
n := graph.Nexts(uint64(idx))
|
||||
if len(n) == 0 {
|
||||
idx <<= 2
|
||||
idx &= graph.kmermask
|
||||
nodeid++
|
||||
nodeidx[idx] = nodeid
|
||||
n := graph.Nexts(idx)
|
||||
p := graph.Previouses(idx)
|
||||
|
||||
if len(n) == 0 || len(p) == 0 {
|
||||
|
||||
node := graph.DecodeNode(idx)
|
||||
buffer.WriteString(
|
||||
fmt.Sprintf("node [ id \"%s\" \n label \"%s\" ]\n", node, node),
|
||||
fmt.Sprintf("node [ id \"%d\" \n label \"%s\" ]\n", nodeid, node),
|
||||
)
|
||||
} else {
|
||||
buffer.WriteString(
|
||||
fmt.Sprintf("node [ id \"%d\" ]\n", nodeid),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
for idx, weight := range graph.graph {
|
||||
src := graph.DecodeNode(idx)
|
||||
label := decode[idx&3]
|
||||
idx <<= 2
|
||||
idx &= graph.kmermask
|
||||
dst := graph.DecodeNode(idx)
|
||||
|
||||
buffer.WriteString(
|
||||
fmt.Sprintf(`edge [ source "%s"
|
||||
target "%s"
|
||||
color "#00FF00"
|
||||
label "%c[%d]"
|
||||
graphics [
|
||||
width %f
|
||||
arrow "last"
|
||||
fill "#007F80"
|
||||
for idx := range graph.graph {
|
||||
srcid := nodeidx[idx]
|
||||
n := graph.Nexts(idx)
|
||||
for _, dst := range n {
|
||||
dstid := nodeidx[dst]
|
||||
label := decode[dst&3]
|
||||
weight := graph.Weight(dst)
|
||||
buffer.WriteString(
|
||||
fmt.Sprintf(`edge [ source "%d"
|
||||
target "%d"
|
||||
color "#00FF00"
|
||||
label "%c[%d]"
|
||||
graphics [
|
||||
width %f
|
||||
arrow "last"
|
||||
fill "#007F80"
|
||||
]
|
||||
]
|
||||
]
|
||||
|
||||
`, src, dst, label, weight, math.Log(float64(weight))),
|
||||
)
|
||||
|
||||
`, srcid, dstid, label, weight, math.Sqrt(float64(weight))),
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
buffer.WriteString("]\n")
|
||||
return buffer.String()
|
||||
@ -472,3 +539,151 @@ func (g *DeBruijnGraph) HammingDistance(kmer1, kmer2 uint64) int {
|
||||
ident &= 0x5555555555555555 & g.kmermask
|
||||
return bits.OnesCount64(ident)
|
||||
}
|
||||
|
||||
type UInt64Heap []uint64
|
||||
|
||||
func (h UInt64Heap) Len() int { return len(h) }
|
||||
func (h UInt64Heap) Less(i, j int) bool { return h[i] < h[j] }
|
||||
func (h UInt64Heap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
|
||||
|
||||
func (h *UInt64Heap) Push(x any) {
|
||||
// Push and Pop use pointer receivers because they modify the slice's length,
|
||||
// not just its contents.
|
||||
*h = append(*h, x.(uint64))
|
||||
}
|
||||
|
||||
func (h *UInt64Heap) Pop() any {
|
||||
old := *h
|
||||
n := len(old)
|
||||
x := old[n-1]
|
||||
*h = old[0 : n-1]
|
||||
return x
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) HaviestPath() []uint64 {
|
||||
|
||||
if g.HasCycle() {
|
||||
return nil
|
||||
}
|
||||
// Initialize the distance array and visited set
|
||||
distances := make(map[uint64]int)
|
||||
visited := make(map[uint64]bool)
|
||||
prevNodes := make(map[uint64]uint64)
|
||||
heaviestNode := uint64(0)
|
||||
heaviestWeight := 0
|
||||
|
||||
queue := &UInt64Heap{}
|
||||
heap.Init(queue)
|
||||
|
||||
startNodes := make(map[uint64]struct{})
|
||||
for _, n := range g.Heads() {
|
||||
startNodes[n] = struct{}{}
|
||||
heap.Push(queue, n)
|
||||
distances[n] = g.Weight(n)
|
||||
prevNodes[n] = 0
|
||||
visited[n] = false
|
||||
}
|
||||
|
||||
// Priority queue to keep track of nodes to visit
|
||||
for len(*queue) > 0 {
|
||||
// Get the node with the smallest distance
|
||||
currentNode := heap.Pop(queue).(uint64)
|
||||
|
||||
// If the current node has already been visited, skip it
|
||||
if visited[currentNode] {
|
||||
continue
|
||||
}
|
||||
|
||||
// Mark the node as visited
|
||||
visited[currentNode] = true
|
||||
weight := distances[currentNode]
|
||||
|
||||
// Update the heaviest node
|
||||
if weight > heaviestWeight {
|
||||
heaviestWeight = weight
|
||||
heaviestNode = currentNode
|
||||
}
|
||||
|
||||
if currentNode == 0 {
|
||||
log.Warn("current node is 0")
|
||||
}
|
||||
// Update the distance of the neighbors
|
||||
nextNodes := g.Nexts(currentNode)
|
||||
for _, nextNode := range nextNodes {
|
||||
if nextNode == 0 {
|
||||
log.Warn("next node is 0")
|
||||
}
|
||||
weight := g.Weight(nextNode) + distances[currentNode]
|
||||
if distances[nextNode] < weight {
|
||||
distances[nextNode] = weight
|
||||
prevNodes[nextNode] = currentNode
|
||||
visited[nextNode] = false
|
||||
heap.Push(queue, nextNode)
|
||||
|
||||
// Keep track of the node with the heaviest weight
|
||||
if weight > heaviestWeight {
|
||||
heaviestWeight = weight
|
||||
heaviestNode = nextNode
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Heaviest node: %d [%v]", heaviestNode, heaviestWeight)
|
||||
// Reconstruct the path from the start node to the heaviest node found
|
||||
heaviestPath := make([]uint64, 0)
|
||||
currentNode := heaviestNode
|
||||
for _, ok := startNodes[currentNode]; !ok && !slices.Contains(heaviestPath, currentNode); _, ok = startNodes[currentNode] {
|
||||
heaviestPath = append(heaviestPath, currentNode)
|
||||
//log.Infof("Current node: %d <- %d", currentNode, prevNodes[currentNode])
|
||||
currentNode = prevNodes[currentNode]
|
||||
}
|
||||
|
||||
if slices.Contains(heaviestPath, currentNode) {
|
||||
log.Fatalf("Cycle detected %v -> %v (%v) len(%v)", heaviestPath, currentNode, startNodes, len(heaviestPath))
|
||||
return nil
|
||||
}
|
||||
|
||||
heaviestPath = append(heaviestPath, currentNode)
|
||||
|
||||
// Reverse the path
|
||||
slices.Reverse(heaviestPath)
|
||||
|
||||
return heaviestPath
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) HasCycle() bool {
|
||||
// Initialize the visited and stack arrays
|
||||
visited := make(map[uint64]bool)
|
||||
stack := make(map[uint64]bool)
|
||||
|
||||
// Helper function to perform DFS
|
||||
var dfs func(node uint64) bool
|
||||
dfs = func(node uint64) bool {
|
||||
visited[node] = true
|
||||
stack[node] = true
|
||||
|
||||
nextNodes := g.Nexts(node)
|
||||
for _, nextNode := range nextNodes {
|
||||
if !visited[nextNode] {
|
||||
if dfs(nextNode) {
|
||||
return true
|
||||
}
|
||||
} else if stack[nextNode] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
stack[node] = false
|
||||
return false
|
||||
}
|
||||
|
||||
// Perform DFS on each node to check for cycles
|
||||
for node := range g.graph {
|
||||
if !visited[node] {
|
||||
if dfs(node) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
@ -130,7 +130,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
|
||||
|
||||
sseq := sequence.String()
|
||||
direct := sseq[start:end]
|
||||
tagstart := obiutils.MaxInt(start-marker.taglength, 0)
|
||||
tagstart := obiutils.Max(start-marker.taglength, 0)
|
||||
ftag := strings.ToLower(sseq[tagstart:start])
|
||||
|
||||
m := DemultiplexMatch{
|
||||
@ -150,7 +150,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
|
||||
reverse, _ := sequence.Subsequence(start, end, false)
|
||||
defer reverse.Recycle()
|
||||
reverse = reverse.ReverseComplement(true)
|
||||
endtag := obiutils.MinInt(end+marker.taglength, sequence.Len())
|
||||
endtag := obiutils.Min(end+marker.taglength, sequence.Len())
|
||||
rtag, err := sequence.Subsequence(end, endtag, false)
|
||||
defer rtag.Recycle()
|
||||
srtag := ""
|
||||
@ -201,7 +201,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
|
||||
sseq := sequence.String()
|
||||
|
||||
reverse := strings.ToLower(sseq[start:end])
|
||||
tagstart := obiutils.MaxInt(start-marker.taglength, 0)
|
||||
tagstart := obiutils.Max(start-marker.taglength, 0)
|
||||
rtag := strings.ToLower(sseq[tagstart:start])
|
||||
|
||||
m := DemultiplexMatch{
|
||||
@ -221,7 +221,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
|
||||
defer direct.Recycle()
|
||||
direct = direct.ReverseComplement(true)
|
||||
|
||||
endtag := obiutils.MinInt(end+marker.taglength, sequence.Len())
|
||||
endtag := obiutils.Min(end+marker.taglength, sequence.Len())
|
||||
ftag, err := sequence.Subsequence(end, endtag, false)
|
||||
defer ftag.Recycle()
|
||||
sftag := ""
|
||||
|
@ -38,13 +38,7 @@ import (
|
||||
// return float64(m)
|
||||
// }
|
||||
|
||||
// func minIntVector(values []int) float64 {
|
||||
// m := values[0]
|
||||
// for _, v := range values {
|
||||
// if v < m {
|
||||
// m = v
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
// return float64(m)
|
||||
// }
|
||||
|
@ -27,7 +27,7 @@ func SuffixLess(suffixarray SuffixArray) func(i, j int) bool {
|
||||
sj := suffixarray.Suffixes[j]
|
||||
bj := (*suffixarray.Sequences)[int(sj.Idx)].Sequence()[sj.Pos:]
|
||||
|
||||
l := obiutils.MinInt(len(bi), len(bj))
|
||||
l := obiutils.Min(len(bi), len(bj))
|
||||
p := 0
|
||||
for p < l && bi[p] == bj[p] {
|
||||
p++
|
||||
@ -92,7 +92,7 @@ func (suffixarray *SuffixArray) CommonSuffix() []int {
|
||||
si := suffixarray.Suffixes[i]
|
||||
bi := (*suffixarray.Sequences)[int(si.Idx)].Sequence()[si.Pos:]
|
||||
|
||||
l := obiutils.MinInt(len(bi), len(bp))
|
||||
l := obiutils.Min(len(bi), len(bp))
|
||||
p := 0
|
||||
for p < l && bi[p] == bp[p] {
|
||||
p++
|
||||
|
@ -4,92 +4,93 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"sort"
|
||||
"slices"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||
consensus_id string,
|
||||
kmer_size int, quorum float64,
|
||||
min_depth float64,
|
||||
max_length int,
|
||||
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
|
||||
|
||||
if save_graph {
|
||||
if dirname == "" {
|
||||
dirname = "."
|
||||
}
|
||||
|
||||
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
|
||||
// path does not exist or is not directory
|
||||
os.RemoveAll(dirname)
|
||||
err := os.Mkdir(dirname, 0755)
|
||||
|
||||
if err != nil {
|
||||
log.Panicf("Cannot create directory %s for saving graphs", dirname)
|
||||
}
|
||||
}
|
||||
|
||||
fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s.fasta", consensus_id)))
|
||||
|
||||
if err == nil {
|
||||
defer fasta.Close()
|
||||
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(0, seqs), obiformats.FormatFastSeqJsonHeader, false))
|
||||
fasta.Close()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
log.Printf("Number of reads : %d\n", len(seqs))
|
||||
|
||||
if kmer_size < 0 {
|
||||
longest := make([]int, len(seqs))
|
||||
|
||||
for i := range seqs {
|
||||
s := seqs[i : i+1]
|
||||
for i, seq := range seqs {
|
||||
s := obiseq.BioSequenceSlice{seq}
|
||||
sa := obisuffix.BuildSuffixArray(&s)
|
||||
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
|
||||
longest[i] = slices.Max(sa.CommonSuffix())
|
||||
}
|
||||
|
||||
o := obiutils.Order(sort.IntSlice(longest))
|
||||
i := int(float64(len(seqs)) * quorum)
|
||||
// o := obiutils.Order(sort.IntSlice(longest))
|
||||
// i := int(float64(len(seqs)) * quorum)
|
||||
|
||||
kmer_size = longest[o[i]] + 1
|
||||
// if i >= len(o) {
|
||||
// i = len(o) - 1
|
||||
// }
|
||||
|
||||
kmer_size = slices.Max(longest) + 1
|
||||
|
||||
// kmer_size = longest[o[i]] + 1
|
||||
log.Printf("estimated kmer size : %d", kmer_size)
|
||||
}
|
||||
|
||||
graph := obikmer.MakeDeBruijnGraph(kmer_size)
|
||||
var graph *obikmer.DeBruijnGraph
|
||||
for {
|
||||
graph = obikmer.MakeDeBruijnGraph(kmer_size)
|
||||
|
||||
for _, s := range seqs {
|
||||
graph.Push(s)
|
||||
for _, s := range seqs {
|
||||
graph.Push(s)
|
||||
}
|
||||
|
||||
if !graph.HasCycle() {
|
||||
break
|
||||
}
|
||||
|
||||
kmer_size++
|
||||
log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size)
|
||||
}
|
||||
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
total_kmer := graph.Len()
|
||||
|
||||
threshold := 0
|
||||
|
||||
switch {
|
||||
case min_depth < 0:
|
||||
spectrum := graph.LinkSpectrum()
|
||||
cum := make(map[int]int)
|
||||
|
||||
spectrum[1] = 0
|
||||
for i := 2; i < len(spectrum); i++ {
|
||||
spectrum[i] += spectrum[i-1]
|
||||
cum[spectrum[i]]++
|
||||
}
|
||||
|
||||
max := 0
|
||||
kmax := 0
|
||||
for k, obs := range cum {
|
||||
if obs > max {
|
||||
max = obs
|
||||
kmax = k
|
||||
}
|
||||
}
|
||||
|
||||
for i, total := range spectrum {
|
||||
if total == kmax {
|
||||
threshold = i
|
||||
break
|
||||
}
|
||||
}
|
||||
threshold /= 2
|
||||
case min_depth >= 1:
|
||||
threshold = int(min_depth)
|
||||
default:
|
||||
threshold = int(float64(len(seqs)) * min_depth)
|
||||
}
|
||||
|
||||
graph.FilterMin(threshold)
|
||||
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
|
||||
if save_graph {
|
||||
|
||||
file, err := os.Create(path.Join(dirname,
|
||||
fmt.Sprintf("%s.gml", seqs[0].Source())))
|
||||
fmt.Sprintf("%s_raw_consensus.gml", consensus_id)))
|
||||
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
@ -99,29 +100,133 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||
}
|
||||
}
|
||||
|
||||
id := seqs[0].Source()
|
||||
if id == "" {
|
||||
id = seqs[0].Id()
|
||||
}
|
||||
seq, err := graph.LongestConsensus(id)
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
total_kmer := graph.Len()
|
||||
|
||||
// threshold := 0
|
||||
|
||||
// switch {
|
||||
// case min_depth < 0:
|
||||
// spectrum := graph.WeightSpectrum()
|
||||
// cum := make(map[int]int)
|
||||
|
||||
// spectrum[1] = 0
|
||||
// for i := 2; i < len(spectrum); i++ {
|
||||
// spectrum[i] += spectrum[i-1]
|
||||
// cum[spectrum[i]]++
|
||||
// }
|
||||
|
||||
// max := 0
|
||||
// kmax := 0
|
||||
// for k, obs := range cum {
|
||||
// if obs > max {
|
||||
// max = obs
|
||||
// kmax = k
|
||||
// }
|
||||
// }
|
||||
|
||||
// for i, total := range spectrum {
|
||||
// if total == kmax {
|
||||
// threshold = i
|
||||
// break
|
||||
// }
|
||||
// }
|
||||
// threshold /= 2
|
||||
|
||||
// if threshold < 1 {
|
||||
// threshold = 1
|
||||
// }
|
||||
|
||||
// log.Info("Estimated kmer_min_occur = ", threshold)
|
||||
// case min_depth >= 1:
|
||||
// threshold = int(min_depth)
|
||||
// default:
|
||||
// threshold = int(float64(len(seqs)) * min_depth)
|
||||
// }
|
||||
|
||||
// graph.FilterMinWeight(threshold)
|
||||
|
||||
// log.Printf("Graph size : %d\n", graph.Len())
|
||||
|
||||
// if save_graph {
|
||||
|
||||
// file, err := os.Create(path.Join(dirname,
|
||||
// fmt.Sprintf("%s_consensus.gml", consensus_id)))
|
||||
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// } else {
|
||||
// file.WriteString(graph.Gml())
|
||||
// file.Close()
|
||||
// }
|
||||
// }
|
||||
|
||||
seq, err := graph.LongestConsensus(consensus_id, max_length)
|
||||
|
||||
sumCount := 0
|
||||
|
||||
for _, s := range seqs {
|
||||
sumCount += s.Count()
|
||||
if seq != nil {
|
||||
for _, s := range seqs {
|
||||
sumCount += s.Count()
|
||||
}
|
||||
|
||||
seq.SetCount(sumCount)
|
||||
seq.SetAttribute("seq_length", seq.Len())
|
||||
seq.SetAttribute("kmer_size", kmer_size)
|
||||
//seq.SetAttribute("kmer_min_occur", threshold)
|
||||
seq.SetAttribute("kmer_max_occur", graph.MaxWeight())
|
||||
seq.SetAttribute("filtered_graph_size", graph.Len())
|
||||
seq.SetAttribute("full_graph_size", total_kmer)
|
||||
}
|
||||
|
||||
seq.SetCount(sumCount)
|
||||
seq.SetAttribute("seq_length", seq.Len())
|
||||
seq.SetAttribute("kmer_size", kmer_size)
|
||||
seq.SetAttribute("kmer_min_occur", threshold)
|
||||
seq.SetAttribute("kmer_max_occur", graph.MaxLink())
|
||||
seq.SetAttribute("filtered_graph_size", graph.Len())
|
||||
seq.SetAttribute("full_graph_size", total_kmer)
|
||||
|
||||
return seq, err
|
||||
}
|
||||
|
||||
// func BuildConsensusWithTimeout(seqs obiseq.BioSequenceSlice,
|
||||
// kmer_size int, quorum float64,
|
||||
// min_depth float64,
|
||||
// save_graph bool, dirname string, timeout time.Duration) (*obiseq.BioSequence, error) {
|
||||
|
||||
// ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
// defer cancel()
|
||||
|
||||
// consensus := func() *obiseq.BioSequence {
|
||||
// cons, err := BuildConsensus(seqs, kmer_size, quorum, min_depth, save_graph, dirname,)
|
||||
// if err != nil {
|
||||
// cons = nil
|
||||
// }
|
||||
|
||||
// return cons
|
||||
// }
|
||||
|
||||
// computation := func() <-chan *obiseq.BioSequence {
|
||||
// result := make(chan *obiseq.BioSequence)
|
||||
|
||||
// go func() {
|
||||
// select {
|
||||
// case <-ctx.Done():
|
||||
// result <- nil
|
||||
// default:
|
||||
// result <- consensus()
|
||||
|
||||
// }
|
||||
// }()
|
||||
|
||||
// return result
|
||||
// }
|
||||
|
||||
// calcResult := computation()
|
||||
|
||||
// select {
|
||||
// case result := <-calcResult:
|
||||
// if result == nil {
|
||||
// return nil, fmt.Errorf("cannot compute consensus")
|
||||
// }
|
||||
// return result, nil
|
||||
// case <-ctx.Done():
|
||||
// return nil, fmt.Errorf("compute consensus timeout, exiting")
|
||||
// }
|
||||
// }
|
||||
|
||||
func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
size := 10
|
||||
@ -153,10 +258,19 @@ func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
|
||||
consensus, err := BuildConsensus(seqs.Slice(),
|
||||
sequences := seqs.Slice()
|
||||
|
||||
id := sequences[0].Source()
|
||||
if id == "" {
|
||||
id = sequences[0].Id()
|
||||
}
|
||||
consensus, err := BuildConsensus(sequences,
|
||||
id,
|
||||
CLIKmerSize(), CLIThreshold(),
|
||||
CLIKmerDepth(),
|
||||
CLISaveGraphToFiles(), CLIGraphFilesDirectory(),
|
||||
CLIMaxConsensusLength(),
|
||||
CLISaveGraphToFiles(),
|
||||
CLIGraphFilesDirectory(),
|
||||
)
|
||||
|
||||
if err == nil {
|
||||
|
@ -9,6 +9,7 @@ var _saveGraph = "__@@NOSAVE@@__"
|
||||
var _kmerSize = -1
|
||||
var _threshold = 0.99
|
||||
var _mindepth = -1.0
|
||||
var _consensus_max_length = -1
|
||||
|
||||
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
@ -38,6 +39,12 @@ func ObiconsensusOptionSet(options *getoptions.GetOpt) {
|
||||
"Default value = -1, which means that the DEPTH is estimated from the data"),
|
||||
)
|
||||
|
||||
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
|
||||
options.ArgName("LENGTH"),
|
||||
options.Description("Maximum length of the consensus sequence. "+
|
||||
"Default value = -1, which means that no limit is applied"),
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
@ -67,3 +74,7 @@ func CLIKmerDepth() float64 {
|
||||
func CLIThreshold() float64 {
|
||||
return _threshold
|
||||
}
|
||||
|
||||
func CLIMaxConsensusLength() int {
|
||||
return _consensus_max_length
|
||||
}
|
||||
|
290
pkg/obitools/obiminion/obiminion.go
Normal file
290
pkg/obitools/obiminion/obiminion.go
Normal file
@ -0,0 +1,290 @@
|
||||
package obiminion
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obigraph"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiannotate"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiuniq"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// SampleWeight calculates the weight of a sample based on the statistics of a sequence.
|
||||
//
|
||||
// Parameters:
|
||||
// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice)
|
||||
// - sample: the sample for which the weight is calculated (string)
|
||||
// - sample_key: the key used to access the sample's statistics (string)
|
||||
// Return type: a function that takes an integer index and returns the weight of the sample at that index (func(int) int)
|
||||
func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func(int) float64 {
|
||||
|
||||
f := func(i int) float64 {
|
||||
|
||||
stats := (*seqs)[i].StatsOn(sample_key, "NA")
|
||||
|
||||
if value, ok := stats[sample]; ok {
|
||||
return float64(value)
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// SeqBySamples sorts the sequences by samples.
|
||||
//
|
||||
// Parameters:
|
||||
// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice)
|
||||
// - sample_key: a string representing the sample key (string)
|
||||
//
|
||||
// Return type:
|
||||
// - map[string]BioSequenceSlice: a map indexed by sample names, each containing a slice of BioSequence objects (map[string]BioSequenceSlice)
|
||||
func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*obiseq.BioSequenceSlice {
|
||||
|
||||
samples := make(map[string]*obiseq.BioSequenceSlice)
|
||||
|
||||
for _, s := range seqs {
|
||||
if s.HasStatsOn(sample_key) {
|
||||
stats := s.StatsOn(sample_key, "NA")
|
||||
for k := range stats {
|
||||
if seqset, ok := samples[k]; ok {
|
||||
*seqset = append(*seqset, s)
|
||||
samples[k] = seqset
|
||||
} else {
|
||||
samples[k] = &obiseq.BioSequenceSlice{s}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if k, ok := s.GetStringAttribute(sample_key); ok {
|
||||
if seqset, ok := samples[k]; ok {
|
||||
*seqset = append(*seqset, s)
|
||||
samples[k] = seqset
|
||||
} else {
|
||||
samples[k] = &obiseq.BioSequenceSlice{s}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return samples
|
||||
|
||||
}
|
||||
|
||||
type Mutation struct {
|
||||
Position int
|
||||
SeqA byte
|
||||
SeqB byte
|
||||
Ratio float64
|
||||
}
|
||||
|
||||
// BuildDiffSeqGraph builds the graph connecting the sequences of seqs that
// differ from each other by at most distmax differences.
//
// Each sequence becomes a vertex; vertex weights are provided by
// SampleWeight(seqs, name, name_key). Edges are labelled with a *Mutation.
// All pairs (i, j), i < j, are examined by nworkers goroutines fed through
// an unbuffered channel of row indices.
//
// Parameters:
//   - name: sample name, used to label the graph and the progress bar.
//   - name_key: attribute name passed to SampleWeight.
//   - seqs: the sequence set to connect.
//   - distmax: maximum number of differences allowed on an edge.
//   - nworkers: number of concurrent edge-computing goroutines.
//
// Returns the populated graph.
func BuildDiffSeqGraph(name, name_key string,
	seqs *obiseq.BioSequenceSlice,
	distmax, nworkers int) *obigraph.Graph[*obiseq.BioSequence, Mutation] {
	graph := obigraph.NewGraphBuffer[*obiseq.BioSequence, Mutation](name, (*[]*obiseq.BioSequence)(seqs))
	iseq := make(chan int) // unbuffered: distributes row indices to the workers
	defer graph.Close()

	ls := len(*seqs)

	sw := SampleWeight(seqs, name, name_key)
	graph.Graph.VertexWeight = sw

	waiting := sync.WaitGroup{}
	waiting.Add(nworkers)

	bar := (*progressbar.ProgressBar)(nil)
	if obiconvert.CLIProgressBar() {

		pbopt := make([]progressbar.Option, 0, 5)
		pbopt = append(pbopt,
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription(fmt.Sprintf("[Build graph] on %s", name)),
		)

		bar = progressbar.NewOptions(len(*seqs), pbopt...)
	}

	// computeEdges consumes row indices i from iseq and tests sequence i
	// against every sequence j > i, adding an edge when they are close enough.
	computeEdges := func() {
		defer waiting.Done()
		for i := range iseq {
			s1 := (*seqs)[i]
			for j := i + 1; j < ls; j++ {
				s2 := (*seqs)[j]
				ratio := sw(i) / sw(j)
				ok, pos, a1, a2 := obialign.D1Or0(s1, s2)
				if ok >= 0 {
					// Sequences are identical or differ by a single
					// position: connect them directly.
					graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio})
				} else if distmax > 1 {
					// Fall back on an LCS-based distance for pairs more
					// than one difference apart.
					lcs, lali := obialign.FastLCSScore(s1, s2, distmax, nil)
					dist := lali - lcs
					if lcs > 0 && dist <= distmax {
						// log.Infof("Seq %s and %s: LCSScore: %d, dist: %d\n", s1.Id(), s2.Id(), lcs, dist)
						// NOTE(review): pos, a1 and a2 come from the D1Or0
						// call above, which did NOT succeed on this branch
						// (ok < 0) — the Mutation recorded here may carry
						// meaningless position/nucleotide values; confirm
						// this is intentional.
						graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio})
					}
				}
			}

			if bar != nil {
				bar.Add(1)
			}
		}
	}

	for i := 0; i < nworkers; i++ {
		go computeEdges()
	}

	// Feed every row index to the workers, then signal completion.
	for i := 0; i < ls; i++ {
		iseq <- i
	}
	close(iseq)

	waiting.Wait()
	return graph.Graph
}
|
||||
|
||||
// MinionDenoise produces one denoised sequence per vertex of the difference
// graph. Vertices with more than 4 neighbors are replaced by a consensus
// built from the vertex and all its neighbors; other vertices are copied
// unchanged.
//
// Parameters:
//   - graph: the difference graph built by BuildDiffSeqGraph.
//   - sample_key: attribute name receiving the sample (graph) name.
//   - kmer_size: kmer size used for consensus building (-1 = estimated).
//   - max_length: maximum allowed consensus length.
//   - threshold, depth: consensus-building tuning parameters forwarded to
//     obiconsensus.BuildConsensus.
//
// Each returned sequence carries the boolean attribute
// "obiminion_consensus" telling whether it is a consensus, its count set to
// the vertex weight, and sample_key set to the graph name.
func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
	sample_key string, kmer_size int, max_length int, threshold float64, depth float64) obiseq.BioSequenceSlice {
	denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices))

	for i, v := range *graph.Vertices {
		var err error
		var clean *obiseq.BioSequence
		degree := graph.Degree(i)
		// Only well-connected vertices (degree > 4) are worth a consensus;
		// the threshold is a hard-coded heuristic.
		if degree > 4 {
			pack := obiseq.MakeBioSequenceSlice(degree + 1)
			for k, j := range graph.Neighbors(i) {
				pack[k] = (*graph.Vertices)[j]
			}
			pack[degree] = v
			clean, err = obiconsensus.BuildConsensus(pack,
				fmt.Sprintf("%s_consensus", v.Id()),
				kmer_size,
				threshold,
				depth, max_length,
				CLISaveGraphToFiles(), CLIGraphFilesDirectory())

			if err != nil {
				// Consensus failed: fall back on the original sequence.
				// NOTE(review): this aliases the stored vertex (the
				// SetAttribute/SetCount calls below mutate it in place),
				// whereas the low-degree branch works on a fresh copy —
				// confirm the asymmetry is intentional.
				log.Warning(err)
				clean = (*graph.Vertices)[i]
				clean.SetAttribute("obiminion_consensus", false)
			} else {
				clean.SetAttribute("obiminion_consensus", true)
			}
			pack.Recycle(false)
		} else {
			// Too few neighbors: keep the sequence as-is (copied).
			clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition())
			clean.SetAttribute("obiminion_consensus", false)
		}

		clean.SetCount(int(graph.VertexWeight(i)))
		clean.SetAttribute(sample_key, graph.Name)

		denoised[i] = clean
	}

	return denoised
}
|
||||
func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
dirname := CLIGraphFilesDirectory()
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
db := itertator.Load()
|
||||
|
||||
log.Infof("Sequence dataset of %d sequeences loaded\n", len(db))
|
||||
|
||||
samples := SeqBySamples(db, CLISampleAttribute())
|
||||
db.Recycle(false)
|
||||
|
||||
log.Infof("Dataset composed of %d samples\n", len(samples))
|
||||
if CLIMaxConsensusLength() > 0 {
|
||||
log.Infof("Maximum consensus length: %d\n", CLIMaxConsensusLength())
|
||||
}
|
||||
|
||||
log.Infof("Dataset composed of %d samples\n", len(samples))
|
||||
|
||||
if CLISaveGraphToFiles() {
|
||||
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
|
||||
// path does not exist or is not directory
|
||||
os.RemoveAll(dirname)
|
||||
err := os.Mkdir(dirname, 0755)
|
||||
|
||||
if err != nil {
|
||||
log.Panicf("Cannot create directory %s for saving graphs", dirname)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bar := (*progressbar.ProgressBar)(nil)
|
||||
if obiconvert.CLIProgressBar() {
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[Filter graph on abundance ratio]"),
|
||||
)
|
||||
|
||||
bar = progressbar.NewOptions(len(samples), pbopt...)
|
||||
}
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
sample_order := 0
|
||||
for sample, seqs := range samples {
|
||||
graph := BuildDiffSeqGraph(sample,
|
||||
CLISampleAttribute(),
|
||||
seqs,
|
||||
CLIDistStepMax(),
|
||||
obioptions.CLIParallelWorkers())
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
|
||||
if CLISaveGraphToFiles() {
|
||||
graph.WriteGmlFile(fmt.Sprintf("%s/%s.gml",
|
||||
CLIGraphFilesDirectory(),
|
||||
sample),
|
||||
false, 1, 0, 3)
|
||||
}
|
||||
|
||||
denoised := MinionDenoise(graph,
|
||||
CLISampleAttribute(),
|
||||
CLIKmerSize(),
|
||||
CLIMaxConsensusLength(),
|
||||
CLIThreshold(),
|
||||
CLIKmerDepth())
|
||||
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised))
|
||||
|
||||
sample_order++
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
obiuniq.AddStatsOn(CLISampleAttribute())
|
||||
obiuniq.SetUniqueInMemory(false)
|
||||
obiuniq.SetNoSingleton(CLINoSingleton())
|
||||
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
|
||||
}
|
179
pkg/obitools/obiminion/options.go
Normal file
179
pkg/obitools/obiminion/options.go
Normal file
@ -0,0 +1,179 @@
|
||||
package obiminion
|
||||
|
||||
import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"

	"github.com/DavidGamba/go-getoptions"
)
|
||||
|
||||
// Default values of the obiminion command-line options.
var _distStepMax = 1            // maximum differences between two linked variants (-d/--distance)
var _sampleAttribute = "sample" // attribute holding the sample name (-s/--sample)

var _ratioMax = 1.0     // NOTE(review): declared but not exposed by the visible option set
var _minEvalRate = 1000 // minimum abundance to evaluate the mutation rate (--min-eval-rate)

var _clusterMode = false // NOTE(review): no flag sets this in the visible option set
var _onlyHead = false    // NOTE(review): no flag sets this in the visible option set

var _kmerSize = -1   // -1: kmer size is estimated from the data (--kmer-size)
var _threshold = 1.0 // threshold used to determine the optimal kmer size (--threshold)
var _mindepth = -1.0 // -1: minimum kmer depth is estimated from the data (--min-depth)

var _consensus_max_length = 1000 // maximum consensus length (--consensus-max-length)

var _NoSingleton = false // discard single-occurrence sequences when true (--no-singleton)

// Sentinel value meaning "do not save" for the two paths below.
var _saveGraph = "__@@NOSAVE@@__"
var _saveRatio = "__@@NOSAVE@@__"
|
||||
// ObiminionOptionSet sets the options for obiminion.
|
||||
//
|
||||
// options: The options for configuring obiminion.
|
||||
func ObiminionOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
|
||||
options.Alias("s"),
|
||||
options.Description("Attribute containing sample descriptions (default %s)."))
|
||||
|
||||
options.IntVar(&_distStepMax, "distance", _distStepMax,
|
||||
options.Alias("d"),
|
||||
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
|
||||
|
||||
options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate,
|
||||
options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate."))
|
||||
|
||||
options.StringVar(&_saveGraph, "save-graph", _saveGraph,
|
||||
options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
|
||||
"The graph files follow the graphml format."),
|
||||
)
|
||||
|
||||
options.StringVar(&_saveRatio, "save-ratio", _saveRatio,
|
||||
options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+
|
||||
"The ratio file follows the csv format."),
|
||||
)
|
||||
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||
options.ArgName("SIZE"),
|
||||
options.Description("The size of the kmer used to build the consensus. "+
|
||||
"Default value = -1, which means that the kmer size is estimated from the data"),
|
||||
)
|
||||
|
||||
options.Float64Var(&_threshold, "threshold", _threshold,
|
||||
options.ArgName("RATIO"),
|
||||
options.Description("A threshold between O and 1 used to determine the optimal "+
|
||||
"kmer size"),
|
||||
)
|
||||
|
||||
options.Float64Var(&_mindepth, "min-depth", _mindepth,
|
||||
options.ArgName("DEPTH"),
|
||||
options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+
|
||||
"reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+
|
||||
"than 1, indicate the minimum count of occurrence for a kmer to be kept. "+
|
||||
"Default value = -1, which means that the DEPTH is estimated from the data"),
|
||||
)
|
||||
|
||||
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
|
||||
options.ArgName("LENGTH"),
|
||||
options.Description("Maximum length of the consensus sequence. "+
|
||||
"Default value = -1, which means that no limit is applied"),
|
||||
)
|
||||
|
||||
options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
|
||||
options.Description("If set, sequences occurring a single time in the data set are discarded."))
|
||||
|
||||
|
||||
}
|
||||
|
||||
// OptionSet registers the full option set of the obiminion command:
// the shared input options, the shared output options, and the
// obiminion-specific options.
//
// It takes a pointer to a getoptions.GetOpt object as a parameter.
// It does not return any value.
func OptionSet(options *getoptions.GetOpt) {
	obiconvert.InputOptionSet(options)
	obiconvert.OutputOptionSet(options)
	ObiminionOptionSet(options)
}
|
||||
|
||||
// CLIDistStepMax returns the maximum distance between two sequences
// for them to be connected in the difference graph.
//
// The value is set by the user with the `-d`/`--distance` flag.
//
// No parameters.
// Returns an integer.
func CLIDistStepMax() int {
	return _distStepMax
}
|
||||
|
||||
// CLISampleAttribute returns the name of the attribute used to store
// the sample name.
//
// The value is set by the user with the `-s`/`--sample` flag.
//
// No parameters.
// Returns a string.
func CLISampleAttribute() string {
	return _sampleAttribute
}
|
||||
|
||||
// CLIMinCountToEvalMutationRate returns the minimum number of reads that
// must be observed before the mutation rate can be evaluated.
//
// The value is set by the user with the `--min-eval-rate` flag.
func CLIMinCountToEvalMutationRate() int {
	return _minEvalRate
}
|
||||
|
||||
// ClusterMode reports whether cluster mode is enabled.
// NOTE(review): no flag in the visible option set toggles _clusterMode,
// so this currently always returns its default — confirm.
func ClusterMode() bool {
	return _clusterMode
}
|
||||
|
||||
// OnlyHead reports whether only head sequences should be considered.
// NOTE(review): the original comment claimed this reflects a `-h` flag,
// but no flag in the visible option set toggles _onlyHead — confirm.
func OnlyHead() bool {
	return _onlyHead
}
|
||||
|
||||
// CLISaveGraphToFiles returns true if the obiminion graphs must be saved,
// i.e. when the user supplied a directory with the `--save-graph` flag.
func CLISaveGraphToFiles() bool {
	return _saveGraph != "__@@NOSAVE@@__"
}
|
||||
|
||||
// CLIGraphFilesDirectory returns the directory where the graph files are
// saved, as set with the `--save-graph` flag. When the flag was not used,
// it returns the internal "__@@NOSAVE@@__" sentinel; check
// CLISaveGraphToFiles first.
func CLIGraphFilesDirectory() string {
	return _saveGraph
}
|
||||
|
||||
// IsSaveRatioTable returns true if the table of abundance ratios must be
// saved, i.e. when the user supplied a filename with the `--save-ratio`
// flag.
func IsSaveRatioTable() bool {
	return _saveRatio != "__@@NOSAVE@@__"
}
|
||||
|
||||
// RatioTableFilename returns the name of the file that stores the ratio
// table, as set with the `--save-ratio` flag. When the flag was not used,
// it returns the internal "__@@NOSAVE@@__" sentinel; check
// IsSaveRatioTable first.
func RatioTableFilename() string {
	return _saveRatio
}
|
||||
|
||||
// CLIKmerSize returns the kmer size to use for building the consensus.
//
// The value is set by the user with the `--kmer-size` flag.
// The value -1 means that the kmer size is estimated as the minimum value
// that ensures that no kmer is present more than once in a sequence.
//
// No parameters.
// Returns an integer value.
func CLIKmerSize() int {
	return _kmerSize
}
|
||||
|
||||
// CLIKmerDepth returns the minimum kmer depth set with the `--min-depth`
// flag: a fraction of the reads when between 0 and 1, an absolute count
// when greater than 1, or -1 to estimate it from the data.
func CLIKmerDepth() float64 {
	return _mindepth
}
|
||||
|
||||
// CLIThreshold returns the threshold (between 0 and 1) used to determine
// the optimal kmer size, as set with the `--threshold` flag.
func CLIThreshold() float64 {
	return _threshold
}
|
||||
|
||||
// CLIMaxConsensusLength returns the maximum allowed length of a consensus
// sequence, as set with the `--consensus-max-length` flag.
func CLIMaxConsensusLength() int {
	return _consensus_max_length
}
|
||||
|
||||
// CLINoSingleton returns whether singleton sequences (sequences occurring
// a single time in the dataset) should be discarded, as set with the
// `--no-singleton` flag.
//
// No parameters.
// Returns a boolean value.
func CLINoSingleton() bool {
	return _NoSingleton
}
|
@ -47,8 +47,8 @@ func CLIPCR(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) {
|
||||
frags := obiiter.IFragments(
|
||||
CLIMaxLength()*1000,
|
||||
CLIMaxLength()*100,
|
||||
CLIMaxLength()+obiutils.MaxInt(len(CLIForwardPrimer()),
|
||||
len(CLIReversePrimer()))+obiutils.MinInt(len(CLIForwardPrimer()),
|
||||
CLIMaxLength()+obiutils.Max(len(CLIForwardPrimer()),
|
||||
len(CLIReversePrimer()))+obiutils.Min(len(CLIForwardPrimer()),
|
||||
len(CLIReversePrimer()))/2,
|
||||
100,
|
||||
obioptions.CLIParallelWorkers(),
|
||||
|
@ -63,7 +63,7 @@ func IndexSequence(seqidx int,
|
||||
if lca[order] == ancestor {
|
||||
// nseq[i]++
|
||||
if mini != -1 {
|
||||
wordmin = obiutils.MaxInt(sequence.Len(), references[order].Len()) - 3 - 4*mini
|
||||
wordmin = obiutils.Max(sequence.Len(), references[order].Len()) - 3 - 4*mini
|
||||
}
|
||||
|
||||
if cw[order] < wordmin {
|
||||
@ -189,7 +189,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
indexed := obiiter.MakeIBioSequence()
|
||||
go func() {
|
||||
for i := 0; i < len(references); i += 10 {
|
||||
limits <- [2]int{i, obiutils.MinInt(i+10, len(references))}
|
||||
limits <- [2]int{i, obiutils.Min(i+10, len(references))}
|
||||
}
|
||||
close(limits)
|
||||
}()
|
||||
|
@ -110,7 +110,7 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
d, _, _, _ := obialign.D1Or0(sequence, references[order])
|
||||
if d >= 0 {
|
||||
score = d
|
||||
alilength = obiutils.MaxInt(sequence.Len(), ref.Len())
|
||||
alilength = obiutils.Max(sequence.Len(), ref.Len())
|
||||
lcs = alilength - score
|
||||
}
|
||||
} else {
|
||||
|
@ -12,6 +12,11 @@ var _chunks = 100
|
||||
var _NAValue = "NA"
|
||||
var _NoSingleton = false
|
||||
|
||||
// UniqueOptionSet sets up unique options for the obiuniq command.
|
||||
//
|
||||
// It configures various options such as merging attributes, category attributes,
|
||||
// defining the NA value, handling singleton sequences, choosing between in-memory
|
||||
// or disk storage, and specifying the chunk count for dataset division.
|
||||
func UniqueOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringSliceVar(&_StatsOn, "merge",
|
||||
1, 1,
|
||||
@ -40,25 +45,67 @@ func UniqueOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
// OptionSet adds to the basic option set every options declared for
|
||||
// the obiuniq command
|
||||
//
|
||||
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
UniqueOptionSet(options)
|
||||
}
|
||||
|
||||
// CLIStatsOn returns the list of variables on witch statistics are computed.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns a slice of strings representing the statistics on values.
|
||||
func CLIStatsOn() []string {
|
||||
return _StatsOn
|
||||
}
|
||||
|
||||
// SetStatsOn sets the list of variables on witch statistics are computed.
|
||||
//
|
||||
// It takes a slice of strings as its parameter and does not return anything.
|
||||
func SetStatsOn(statsOn []string) {
|
||||
_StatsOn = statsOn
|
||||
}
|
||||
|
||||
// AddStatsOn adds a variable to the list of variables on witch statistics are computed.
|
||||
//
|
||||
// Parameters:
|
||||
// - statsOn: variadic strings representing the statistics to be added.
|
||||
func AddStatsOn(statsOn ...string) {
|
||||
_StatsOn = append(_StatsOn, statsOn...)
|
||||
}
|
||||
|
||||
// CLIKeys returns the keys used to distinguished among identical sequences.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns a slice of strings representing the keys used by the CLI.
|
||||
func CLIKeys() []string {
|
||||
return _Keys
|
||||
}
|
||||
|
||||
// CLIUniqueInMemory returns if the unique function is running in memory only.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns a boolean value indicating whether the function is running in memory or not.
|
||||
func CLIUniqueInMemory() bool {
|
||||
return _InMemory
|
||||
}
|
||||
|
||||
// SetUniqueInMemory sets whether the unique function is running in memory or not.
|
||||
//
|
||||
// inMemory bool - A boolean value indicating whether the function is running in memory.
|
||||
// No return value.
|
||||
func SetUniqueInMemory(inMemory bool) {
|
||||
_InMemory = inMemory
|
||||
}
|
||||
|
||||
// CLINumberOfChunks returns the number of chunks used for the first bucket sort step used by the unique function.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns an integer representing the number of chunks.
|
||||
func CLINumberOfChunks() int {
|
||||
if _chunks <= 1 {
|
||||
return 1
|
||||
@ -67,10 +114,40 @@ func CLINumberOfChunks() int {
|
||||
return _chunks
|
||||
}
|
||||
|
||||
// SetNumberOfChunks sets the number of chunks used for the first bucket sort step used by the unique function.
|
||||
//
|
||||
// chunks int - The number of chunks to be set.
|
||||
// No return value.
|
||||
func SetNumberOfChunks(chunks int) {
|
||||
_chunks = chunks
|
||||
}
|
||||
|
||||
// CLINAValue returns the value used as a placeholder for missing values.
|
||||
//
|
||||
// No parameters.
|
||||
// Return type: string.
|
||||
func CLINAValue() string {
|
||||
return _NAValue
|
||||
}
|
||||
|
||||
// SetNAValue sets the NA value to the specified string.
|
||||
//
|
||||
// value string - The value to set as the NA value.
|
||||
func SetNAValue(value string) {
|
||||
_NAValue = value
|
||||
}
|
||||
|
||||
// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns a boolean value indicating whether or not singleton sequences should be discarded.
|
||||
func CLINoSingleton() bool {
|
||||
return _NoSingleton
|
||||
}
|
||||
|
||||
// SetNoSingleton sets the boolean value indicating whether or not singleton sequences should be discarded.
|
||||
//
|
||||
// noSingleton bool - The boolean value to set for _NoSingleton.
|
||||
func SetNoSingleton(noSingleton bool) {
|
||||
_NoSingleton = noSingleton
|
||||
}
|
@ -8,7 +8,7 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
)
|
||||
|
||||
func Unique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
func CLIUnique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
options := make([]obichunk.WithOption, 0, 30)
|
||||
|
||||
|
@ -1,69 +1,32 @@
|
||||
package obiutils
|
||||
|
||||
import "golang.org/x/exp/constraints"
|
||||
import (
|
||||
"golang.org/x/exp/constraints"
|
||||
)
|
||||
|
||||
func MinInt(x, y int) int {
|
||||
func Min[T constraints.Ordered](x, y T) T {
|
||||
if x < y {
|
||||
return x
|
||||
}
|
||||
return y
|
||||
}
|
||||
|
||||
func MaxInt(x, y int) int {
|
||||
func Max[T constraints.Ordered](x, y T) T {
|
||||
if x < y {
|
||||
return y
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
func MinMaxInt(x, y int) (int, int) {
|
||||
func MinMax[T constraints.Ordered](x, y T) (T, T) {
|
||||
if x < y {
|
||||
return x, y
|
||||
}
|
||||
return y, x
|
||||
}
|
||||
|
||||
func MinUInt16(x, y uint16) uint16 {
|
||||
if x < y {
|
||||
return x
|
||||
}
|
||||
return y
|
||||
}
|
||||
|
||||
func MaxUInt16(x, y uint16) uint16 {
|
||||
if x < y {
|
||||
return y
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
func MinSlice[T constraints.Ordered](vec []T) T {
|
||||
if len(vec) == 0 {
|
||||
panic("empty slice")
|
||||
}
|
||||
min := vec[0]
|
||||
for _, v := range vec {
|
||||
if v < min {
|
||||
min = v
|
||||
}
|
||||
}
|
||||
return min
|
||||
}
|
||||
|
||||
func MaxSlice[T constraints.Ordered](vec []T) T {
|
||||
if len(vec) == 0 {
|
||||
panic("empty slice")
|
||||
}
|
||||
max := vec[0]
|
||||
for _, v := range vec {
|
||||
if v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
func RangeSlice[T constraints.Ordered](vec []T) (min, max T) {
|
||||
func MinMaxSlice[T constraints.Ordered](vec []T) (min, max T) {
|
||||
if len(vec) == 0 {
|
||||
panic("empty slice")
|
||||
}
|
||||
|
@ -13,8 +13,8 @@ import (
|
||||
// Zero is a zero-valued uint128.
|
||||
var Zero Uint128
|
||||
|
||||
// Max is the largest possible uint128 value.
|
||||
var Max = New(math.MaxUint64, math.MaxUint64)
|
||||
// MaxUint128 is the largest possible uint128 value.
|
||||
var MaxUint128 = New(math.MaxUint64, math.MaxUint64)
|
||||
|
||||
// A Uint128 is an unsigned 128-bit number.
|
||||
type Uint128 struct {
|
||||
|
Reference in New Issue
Block a user