Add obiminion first version

Former-commit-id: aa5ace7bd4d2266333715fca7094d1c3cbbb5e6d
This commit is contained in:
Eric Coissac
2024-05-14 08:16:12 +02:00
parent 9e63013bc2
commit 017030bcce
24 changed files with 1599 additions and 469 deletions

View File

@ -0,0 +1,33 @@
package main
import (
"os"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiminion"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
)
// main is the entry point of the obiminion command.
//
// It parses the command-line options, reads the input sequences,
// runs the obiminion denoising procedure on them, writes the
// resulting sequences, and finally waits for the output pipeline
// to drain before exiting.
func main() {
	optionParser := obioptions.GenerateOptionParser(obiminion.OptionSet)

	_, args := optionParser(os.Args)

	sequences, err := obiconvert.CLIReadBioSequences(args...)
	if err != nil {
		log.Errorf("Cannot open file (%v)", err)
		os.Exit(1)
	}

	denoised := obiminion.CLIOBIMinion(sequences)

	obiconvert.CLIWriteBioSequences(denoised, true)
	obiiter.WaitForLastPipe()
}

View File

@ -43,7 +43,7 @@ func main() {
os.Exit(1)
}
unique := obiuniq.Unique(sequences)
unique := obiuniq.CLIUnique(sequences)
obiconvert.CLIWriteBioSequences(unique, true)
obiiter.WaitForLastPipe()

View File

@ -58,182 +58,3 @@ var _empty = encodeValues(0, 0, false)
var _out = encodeValues(0, 30000, true)
var _notavail = encodeValues(0, 30000, false)
// func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
// lA := seqA.Len()
// lB := seqB.Len()
// // Ensure that A is the longest
// if lA < lB {
// seqA, seqB = seqB, seqA
// lA, lB = lB, lA
// }
// if maxError == -1 {
// maxError = lA * 2
// }
// delta := lA - lB
// // The difference of length is larger the maximum allowed errors
// if delta > maxError {
// return -1, -1
// }
// // Doit-on vraiment diviser par deux ??? pas certain
// extra := (maxError - delta) + 1
// even := 1 + delta + 2*extra
// width := 2*even - 1
// if buffer == nil {
// var local []uint64
// buffer = &local
// }
// if cap(*buffer) < 2*width {
// *buffer = make([]uint64, 3*width)
// }
// previous := (*buffer)[0:width]
// current := (*buffer)[width:(2 * width)]
// previous[extra] = _empty
// previous[extra+even] = encodeValues(0, 1, false)
// previous[extra+even-1] = encodeValues(0, 1, false)
// N := lB + ((delta) >> 1)
// bA := seqA.Sequence()
// bB := seqB.Sequence()
// // log.Println("N = ", N)
// for y := 1; y <= N; y++ {
// // in_matrix := false
// x1 := y - lB + extra
// x2 := extra - y
// xs := obiutils.MaxInt(obiutils.MaxInt(x1, x2), 0)
// x1 = y + extra
// x2 = lA + extra - y
// xf := obiutils.MinInt(obiutils.MinInt(x1, x2), even-1) + 1
// for x := xs; x < xf; x++ {
// i := y - x + extra
// j := y + x - extra
// var Sdiag, Sleft, Sup uint64
// switch {
// case i == 0:
// Sup = _notavail
// Sdiag = _notavail
// Sleft = encodeValues(0, j-1, false)
// case j == 0:
// Sup = encodeValues(0, i-1, false)
// Sdiag = _notavail
// Sleft = _notavail
// default:
// Sdiag = previous[x]
// if bA[j-1] == bB[i-1] {
// Sdiag = _incscore(Sdiag)
// }
// if x < (even - 1) {
// Sup = previous[x+even]
// } else {
// Sup = _out
// }
// if x > 0 {
// Sleft = previous[x+even-1]
// } else {
// Sleft = _out
// }
// }
// var score uint64
// switch {
// case Sdiag >= Sup && Sdiag >= Sleft:
// score = Sdiag
// case Sup >= Sleft:
// score = Sup
// default:
// score = Sleft
// }
// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) {
// score = _setout(score)
// }
// current[x] = _incpath(score)
// }
// // . 9 10 + 2 - 1
// x1 = y - lB + extra + even
// x2 = extra - y + even - 1
// xs = obiutils.MaxInt(obiutils.MaxInt(x1, x2), even)
// x1 = y + extra + even
// x2 = lA + extra - y + even - 1
// xf = obiutils.MinInt(obiutils.MinInt(x1, x2), width-1) + 1
// for x := xs; x < xf; x++ {
// i := y - x + extra + even
// j := y + x - extra - even + 1
// var Sdiag, Sleft, Sup uint64
// switch {
// case i == 0:
// Sup = _notavail
// Sdiag = _notavail
// Sleft = encodeValues(0, j-1, false)
// case j == 0:
// Sup = encodeValues(0, i-1, false)
// Sdiag = _notavail
// Sleft = _notavail
// default:
// Sdiag = previous[x]
// if bA[j-1] == bB[i-1] {
// Sdiag = _incscore(Sdiag)
// }
// Sleft = current[x-even]
// Sup = current[x-even+1]
// }
// var score uint64
// switch {
// case Sdiag >= Sup && Sdiag >= Sleft:
// score = Sdiag
// case Sup >= Sleft:
// score = Sup
// default:
// score = Sleft
// }
// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) {
// score = _setout(score)
// }
// current[x] = _incpath(score)
// }
// previous, current = current, previous
// }
// s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)])
// if o {
// return -1, -1
// }
// return s, l
// }

View File

@ -130,11 +130,11 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
// in_matrix := false
x1 := y - lB + extra
x2 := extra - y
xs := obiutils.MaxInt(obiutils.MaxInt(x1, x2), 0)
xs := obiutils.Max(obiutils.Max(x1, x2), 0)
x1 = y + extra
x2 = lA + extra - y
xf := obiutils.MinInt(obiutils.MinInt(x1, x2), even-1) + 1
xf := obiutils.Min(obiutils.Min(x1, x2), even-1) + 1
for x := xs; x < xf; x++ {
@ -222,11 +222,11 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
// . 9 10 + 2 - 1
x1 = y - lB + extra + even
x2 = extra - y + even - 1
xs = obiutils.MaxInt(obiutils.MaxInt(x1, x2), even)
xs = obiutils.Max(obiutils.Max(x1, x2), even)
x1 = y + extra + even
x2 = lA + extra - y + even - 1
xf = obiutils.MinInt(obiutils.MinInt(x1, x2), width-1) + 1
xf = obiutils.Min(obiutils.Min(x1, x2), width-1) + 1
for x := xs; x < xf; x++ {
@ -348,16 +348,15 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
// - Matching: 1
// - Mismatch or gap: 0
//
// Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps.
// Parameters:
// - seqA: The first bio sequence.
// - seqB: The second bio sequence.
// - maxError: The maximum allowed error between the sequences. If set to -1, no limit is applied.
// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
//
// if buffer != nil, the buffer is used to store intermediate results.
// Otherwise, a new buffer is allocated.
//
// seqA: The first bio sequence.
// seqB: The second bio sequence.
// maxError: The maximum allowed error between the sequences.
// buffer: A buffer to store intermediate results.
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
// Returns:
// - The score of the longest common subsequence.
// - The length of the shortest alignment corresponding to the LCS.
func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
}
@ -372,14 +371,16 @@ func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uin
// - Matching: 1
// - Mismatch or gap: 0
//
// if buffer != nil, the buffer is used to store intermediate results.
// Otherwise, a new buffer is allocated.
// Parameters:
// - seqA: The first bio sequence.
// - seqB: The second bio sequence.
// - maxError: The maximum allowed error between the sequences. If set to -1, no limit is applied.
// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
//
// seqA: The first bio sequence.
// seqB: The second bio sequence.
// maxError: The maximum allowed error between the sequences.
// buffer: A buffer to store intermediate results.
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
// Returns:
// - The score of the longest common subsequence.
// - The length of the shortest alignment corresponding to the LCS.
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
}

View File

@ -348,8 +348,8 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (
start = best[0] - nerr
end = best[0] + int(pattern.pointer.pointer.patlen) + nerr
start = obiutils.MaxInt(start, 0)
end = obiutils.MinInt(end, sequence.Len())
start = obiutils.Max(start, 0)
end = obiutils.Min(end, sequence.Len())
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
frg := sequence.pointer.reference.Sequence()[start:end]
@ -387,8 +387,8 @@ func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int)
if m[2] > 0 && pattern.pointer.pointer.hasIndel {
start := m[0] - m[2]
end := m[0] + int(pattern.pointer.pointer.patlen) + m[2]
start = obiutils.MaxInt(start, 0)
end = obiutils.MinInt(end, sequence.Len())
start = obiutils.Max(start, 0)
end = obiutils.Min(end, sequence.Len())
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
frg := sequence.pointer.reference.Sequence()[start:end]

327
pkg/obigraph/graph.go Normal file
View File

@ -0,0 +1,327 @@
package obigraph
import (
"bytes"
"fmt"
"io"
"math"
"os"
"text/template"
log "github.com/sirupsen/logrus"
)
// Edge represents a single edge of a graph connecting vertex From to
// vertex To, with an optional payload of type T attached to it.
type Edge[T any] struct {
	From int
	To   int
	Data *T
}

// Edges stores the adjacency of a graph as a two-level map:
// source vertex index -> destination vertex index -> edge payload.
type Edges[T any] map[int]map[int]*T

// Graph is a generic graph whose vertices carry values of type V and
// whose edges carry payloads of type T. Vertices are referenced by
// their index in the Vertices slice.
type Graph[V any, T any] struct {
	Name         string
	Vertices     *[]V
	Edges        *Edges[T] // forward adjacency
	ReverseEdges *Edges[T] // reversed adjacency, used to find parents
	VertexWeight func(int) float64
	VertexId     func(int) string
	EdgeWeight   func(int, int) float64
}

// NewEdges allocates an empty edge set and returns a pointer to it.
func NewEdges[T any]() *Edges[T] {
	edges := Edges[T]{}
	return &edges
}

// AddEdge records an edge going from vertex `from` to vertex `to`,
// attaching `data` to it. The inner destination map is created lazily
// on the first edge leaving `from`.
func (e *Edges[T]) AddEdge(from, to int, data *T) {
	targets, exists := (*e)[from]
	if !exists {
		targets = make(map[int]*T)
		(*e)[from] = targets
	}
	targets[to] = data
}
// NewGraph creates a new graph with the specified name and vertices.
//
// Parameters:
//   - name: a string representing the name of the graph.
//   - vertices: a pointer to the slice of vertices of type V.
//
// Returns:
//   - *Graph[V, T]: the newly created graph.
//
// The weight and id accessors are initialized with neutral defaults
// (every vertex/edge weighs 1.0, vertex i is labelled "Vi"); callers
// may override them after construction.
func NewGraph[V, T any](name string, vertices *[]V) *Graph[V, T] {
	defaultVertexWeight := func(int) float64 { return 1.0 }
	defaultEdgeWeight := func(int, int) float64 { return 1.0 }
	defaultVertexId := func(i int) string { return fmt.Sprintf("V%d", i) }

	return &Graph[V, T]{
		Name:         name,
		Vertices:     vertices,
		Edges:        NewEdges[T](),
		ReverseEdges: NewEdges[T](),
		VertexWeight: defaultVertexWeight,
		EdgeWeight:   defaultEdgeWeight,
		VertexId:     defaultVertexId,
	}
}
// AddEdge adds an undirected edge between two vertices in the graph.
// The edge is registered in both orientations in both the forward and
// the reverse adjacency maps, so Neighbors and Parents stay symmetric.
//
// Parameters:
//   - from: the index of the starting vertex.
//   - to: the index of the ending vertex.
//   - data: a pointer to the data associated with the edge.
//
// If either index is out of range, an error is logged and the edge is
// not inserted (previously the edge was inserted anyway, leaving the
// graph referencing nonexistent vertices).
func (g *Graph[V, T]) AddEdge(from, to int, data *T) {
	lv := len(*g.Vertices)
	if from < 0 || to < 0 || from >= lv || to >= lv {
		log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1)
		return
	}
	g.Edges.AddEdge(from, to, data)
	g.Edges.AddEdge(to, from, data)
	g.ReverseEdges.AddEdge(to, from, data)
	g.ReverseEdges.AddEdge(from, to, data)
}
// AddDirectedEdge adds a directed edge from one vertex to another in the graph.
// The edge is stored as from->to in the forward adjacency map and as
// to->from in the reverse adjacency map.
//
// Parameters:
//   - from: an integer representing the index of the starting vertex.
//   - to: an integer representing the index of the ending vertex.
//   - data: a pointer to the data associated with the edge.
//
// If either index is out of range, an error is logged and the edge is
// not inserted (previously the edge was inserted anyway, leaving the
// graph referencing nonexistent vertices).
func (g *Graph[V, T]) AddDirectedEdge(from, to int, data *T) {
	lv := len(*g.Vertices)
	if from < 0 || to < 0 || from >= lv || to >= lv {
		log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1)
		return
	}
	g.Edges.AddEdge(from, to, data)
	g.ReverseEdges.AddEdge(to, from, data)
}
// SetAsDirectedEdge converts an existing undirected edge into a directed
// edge going from `from` to `to`.
//
// Parameters:
//   - from: an integer representing the index of the starting vertex.
//   - to: an integer representing the index of the ending vertex.
//
// After the call the adjacency maps match what AddDirectedEdge would
// have produced: Edges keeps from->to only, ReverseEdges keeps to->from
// only. (The previous implementation deleted both orientations from
// Edges, removing the edge entirely, and never touched ReverseEdges.)
// If no edge exists between the two vertices, an error is logged.
func (g *Graph[V, T]) SetAsDirectedEdge(from, to int) {
	lv := len(*g.Vertices)
	if from < 0 || to < 0 || from >= lv || to >= lv {
		log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1)
		return
	}

	if _, ok := (*g.Edges)[from][to]; ok {
		// Drop only the reversed orientation so the forward edge survives.
		if _, ok := (*g.Edges)[to][from]; ok {
			delete((*g.Edges)[to], from)
			delete((*g.ReverseEdges)[from], to)
		}
		return
	}
	log.Error("no edge from ", from, " to ", to)
}
// Neighbors returns the indices of the vertices directly reachable from
// the given vertex through the forward adjacency map.
//
// Parameters:
//   - v: an integer representing the index of the vertex.
//
// Returns:
//   - []int: the neighbor vertex indices, or nil when the vertex has no
//     outgoing edges. The order of the result is unspecified (map order).
func (g *Graph[V, T]) Neighbors(v int) []int {
	targets, ok := (*g.Edges)[v]
	if !ok {
		return nil
	}
	result := make([]int, 0, len(targets))
	for target := range targets {
		result = append(result, target)
	}
	return result
}
// Degree returns the number of outgoing edges of a vertex.
//
// Parameters:
//   - v: an integer representing the index of the vertex.
//
// Returns:
//   - an integer representing the degree of the vertex (0 when the
//     vertex has no entry in the adjacency map — len of a missing map
//     entry is 0).
func (g *Graph[V, T]) Degree(v int) int {
	return len((*g.Edges)[v])
}
// Parents returns the indices of the vertices having an edge pointing
// to the given vertex, as recorded in the reverse adjacency map.
//
// Parameters:
//   - v: an integer representing the index of the vertex.
//
// Returns:
//   - []int: the parent vertex indices, or nil when the vertex has no
//     incoming edges. The order of the result is unspecified (map order).
func (g *Graph[V, T]) Parents(v int) []int {
	sources, ok := (*g.ReverseEdges)[v]
	if !ok {
		return nil
	}
	result := make([]int, 0, len(sources))
	for source := range sources {
		result = append(result, source)
	}
	return result
}
// ParentDegree returns the number of incoming edges of a vertex, i.e.
// the number of its parents in the reverse adjacency map.
//
// Parameters:
//   - v: an integer representing the index of the vertex.
//
// Returns:
//   - an integer representing the in-degree of the vertex (0 when the
//     vertex has no entry in the reverse adjacency map).
func (g *Graph[V, T]) ParentDegree(v int) int {
	return len((*g.ReverseEdges)[v])
}
// gml_graph bundles a graph together with the rendering options so it
// can be passed as a single value to the GML text/template in Gml.
type gml_graph[V any, T any] struct {
	Graph       *Graph[V, T]
	As_directed bool    // render as a directed graph
	Min_degree  int     // vertices below this degree are skipped
	Threshold   float64 // vertex weight above which the node is a circle
	Scale       int     // scaling factor applied to node sizes
}
// Gml generates a GML representation of the graph.
//
// Parameters:
//   - as_directed: whether the graph should be rendered as directed.
//   - min_degree: vertices whose degree is below this value are skipped.
//   - threshold: vertex weight at or above which a node is drawn as a
//     circle instead of a rectangle.
//   - scale: scaling factor applied to the node sizes.
//
// Returns:
//   - string: the GML representation of the graph.
//
// Fix: the edge section previously compared the vertex *indices*
// ($source / $target) with Min_degree while the node section compared
// the vertex degree, so edges could reference nodes that were filtered
// out. Both sections now filter on (Degree v).
func (g *Graph[V, T]) Gml(as_directed bool, min_degree int, threshold float64, scale int) string {
	var gml bytes.Buffer

	data := gml_graph[V, T]{
		Graph:       g,
		As_directed: as_directed,
		Min_degree:  min_degree,
		Threshold:   threshold,
		Scale:       scale,
	}

	digraphTpl := template.New("gml_digraph")
	digraph := ` {{$context := .}}
graph [
	comment "{{ if $context.As_directed }}Directed graph{{ else }}Undirected graph{{ end }} {{ Name }}"
	directed {{ if $context.As_directed }}1{{ else }}0{{ end }}
	{{range $index, $data:= $context.Graph.Vertices}}
	{{ if (ge (Degree $index) $context.Min_degree)}}
	node [ id {{$index}}
		graphics [
			type "{{ Shape $index }}"
			h {{ Sqrt (VertexWeight $index) }}
			w {{ Sqrt (VertexWeight $index) }}
		]
	]
	{{ end }}
	{{ end }}
	{{range $source, $data:= $context.Graph.Edges}}
	{{range $target, $edge:= $data}}
	{{ if and (ge (Degree $source) $context.Min_degree) (ge (Degree $target) $context.Min_degree) (or $context.As_directed (lt $source $target))}}
	edge [ source {{$source}}
		target {{$target}}
		color "#00FF00"
	]
	{{ end }}
	{{ end }}
	{{ end }}
]
`

	tmpl, err := digraphTpl.Funcs(template.FuncMap{
		// Node size: scaled integer square root of the vertex weight.
		"Sqrt":         func(i float64) int { return scale * int(math.Floor(math.Sqrt(i))) },
		"Name":         func() string { return g.Name },
		"VertexId":     func(i int) string { return g.VertexId(i) },
		"Degree":       func(i int) int { return g.Degree(i) },
		"VertexWeight": func(i int) float64 { return g.VertexWeight(i) },
		"Shape": func(i int) string {
			if g.VertexWeight(i) >= threshold {
				return "circle"
			}
			return "rectangle"
		},
	}).Parse(digraph)

	if err != nil {
		panic(err)
	}

	err = tmpl.Execute(&gml, data)
	if err != nil {
		panic(err)
	}

	return gml.String()
}
// WriteGml writes the GML representation of the graph to an io.Writer.
//
// Parameters:
//   - w: the io.Writer to write the GML representation to.
//   - as_directed: whether the graph should be rendered as directed.
//   - min_degree: vertices below this degree are skipped.
//   - threshold: the threshold value controlling node shapes.
//   - scale: the scaling factor applied to node sizes.
//
// The function panics if the write fails.
func (g *Graph[V, T]) WriteGml(w io.Writer, as_directed bool, min_degree int, threshold float64, scale int) {
	gml := g.Gml(as_directed, min_degree, threshold, scale)
	if _, err := w.Write([]byte(gml)); err != nil {
		panic(err)
	}
}
// WriteGmlFile writes the graph in GML format to the specified file.
//
// Parameters:
//   - filename: the name of the file to write the GML representation to.
//   - as_directed: whether the graph should be rendered as directed.
//   - min_degree: vertices below this degree are skipped.
//   - threshold: the threshold value controlling node shapes.
//   - scale: the scaling factor applied to node sizes.
//
// The file is created (or truncated) and closed before returning.
// The function panics if the file cannot be created.
func (g *Graph[V, T]) WriteGmlFile(filename string, as_directed bool, min_degree int, threshold float64, scale int) {
	file, err := os.Create(filename)
	if err != nil {
		panic(err)
	}
	defer file.Close()

	g.WriteGml(file, as_directed, min_degree, threshold, scale)
}

104
pkg/obigraph/graphbuffer.go Normal file
View File

@ -0,0 +1,104 @@
package obigraph
import (
"io"
"os"
)
// GraphBuffer wraps a Graph with a channel of edges so that several
// goroutines can add edges concurrently: producers send Edge values on
// Channel and a single consumer goroutine (started by NewGraphBuffer)
// performs the actual insertions into Graph.
type GraphBuffer[V, T any] struct {
	Graph   *Graph[V, T]
	Channel chan Edge[T]
}
// NewGraphBuffer creates a new GraphBuffer with the given name and vertices.
//
// Parameters:
//   - name: the name of the underlying graph.
//   - vertices: a pointer to the slice of vertices backing the graph.
//
// Returns:
//   - *GraphBuffer[V, T]: the newly created GraphBuffer.
//
// A consumer goroutine is started that serializes every edge received
// on Channel into the graph; it terminates when Close() closes the
// channel.
func NewGraphBuffer[V, T any](name string, vertices *[]V) *GraphBuffer[V, T] {
	b := &GraphBuffer[V, T]{
		Graph:   NewGraph[V, T](name, vertices),
		Channel: make(chan Edge[T]),
	}

	go func() {
		for e := range b.Channel {
			b.Graph.AddEdge(e.From, e.To, e.Data)
		}
	}()

	return b
}
// AddEdge adds an edge to the GraphBuffer by sending it on the channel;
// the consumer goroutine performs the actual insertion into the graph.
//
// Parameters:
//   - from: the index of the starting vertex.
//   - to: the index of the ending vertex.
//   - data: a pointer to the data associated with the edge.
func (g *GraphBuffer[V, T]) AddEdge(from, to int, data *T) {
	edge := Edge[T]{From: from, To: to, Data: data}
	g.Channel <- edge
}
// AddDirectedEdge adds a directed edge from one vertex to another in the GraphBuffer.
//
// Parameters:
//   - from: the index of the starting vertex.
//   - to: the index of the ending vertex.
//   - data: a pointer to the data associated with the edge.
//
// NOTE(review): the Edge sent on the channel carries no direction flag
// and the consumer goroutine in NewGraphBuffer always calls the
// undirected Graph.AddEdge, so this method currently behaves exactly
// like AddEdge — confirm whether directed insertion was intended.
func (g *GraphBuffer[V, T]) AddDirectedEdge(from, to int, data *T) {
	g.Channel <- Edge[T]{
		From: from,
		To:   to,
		Data: data,
	}
}
// Gml generates a GML representation of the buffered graph by
// delegating to the underlying Graph.Gml.
//
// Parameters:
//   - as_directed: whether the graph should be rendered as directed.
//   - min_degree: vertices below this degree are skipped.
//   - threshold: the threshold value controlling node shapes.
//   - scale: the scaling factor applied to node sizes.
//
// Returns:
//   - string: the GML representation of the graph.
func (g *GraphBuffer[V, T]) Gml(as_directed bool, min_degree int, threshold float64, scale int) string {
	return g.Graph.Gml(as_directed, min_degree, threshold, scale)
}
// WriteGmlFile writes the buffered graph in GML format to the named file,
// delegating to the underlying Graph.WriteGmlFile (the previous body
// duplicated that method line for line).
//
// Parameters:
//   - filename: the name of the file to write the GML representation to.
//   - as_directed: whether the graph should be rendered as directed.
//   - min_degree: vertices below this degree are skipped.
//   - threshold: the threshold value controlling node shapes.
//   - scale: the scaling factor applied to node sizes.
//
// NOTE(review): edges still queued on Channel are not flushed before
// writing; callers presumably Close() the buffer first — confirm.
func (g *GraphBuffer[V, T]) WriteGmlFile(filename string, as_directed bool, min_degree int, threshold float64, scale int) {
	g.Graph.WriteGmlFile(filename, as_directed, min_degree, threshold, scale)
}
// WriteGml writes the GML representation of the buffered graph to an
// io.Writer, delegating to the underlying Graph.WriteGml (the previous
// body duplicated that method's write-and-panic logic).
//
// Parameters:
//   - w: the io.Writer to write the GML representation to.
//   - as_directed: whether the graph should be rendered as directed.
//   - min_degree: vertices below this degree are skipped.
//   - threshold: the threshold value controlling node shapes.
//   - scale: the scaling factor applied to node sizes.
func (g *GraphBuffer[V, T]) WriteGml(w io.Writer, as_directed bool, min_degree int, threshold float64, scale int) {
	g.Graph.WriteGml(w, as_directed, min_degree, threshold, scale)
}
// Close closes the GraphBuffer by closing its channel, which lets the
// consumer goroutine started in NewGraphBuffer terminate once it has
// drained the remaining edges.
//
// No parameters.
func (g *GraphBuffer[V, T]) Close() {
	close(g.Channel)
}

View File

@ -30,7 +30,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
news = append(news, s)
} else {
for i := 0; i < s.Len(); i += step {
end := obiutils.MinInt(i+length, s.Len())
end := obiutils.Min(i+length, s.Len())
fusion := false
if (s.Len() - end) < step {
end = s.Len()

View File

@ -9,9 +9,10 @@ import (
type Table4mer [256]uint16
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
iternal_buffer := Encode4mer(seq, buffer)
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
iternal_buffer := Encode4mer(seq, buffer) // The slice of 4-mer codes
if counts == nil {
var w Table4mer
@ -19,7 +20,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl
}
// Every cells of the counter is set to zero
for i := 0; i < 256; i++ {
for i := 0; i < 256; i++ { // 256 is the number of possible 4-mer codes
(*counts)[i] = 0
}
@ -32,7 +33,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl
func Common4Mer(count1, count2 *Table4mer) int {
sum := 0
for i := 0; i < 256; i++ {
sum += int(obiutils.MinUInt16((*count1)[i], (*count2)[i]))
sum += int(obiutils.Min((*count1)[i], (*count2)[i]))
}
return sum
}
@ -48,7 +49,7 @@ func Sum4Mer(count *Table4mer) int {
func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
s1 := Sum4Mer(count1)
s2 := Sum4Mer(count2)
smin := obiutils.MinInt(s1, s2)
smin := obiutils.Min(s1, s2)
cw := Common4Mer(count1, count2)
@ -65,7 +66,7 @@ func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
func Error4MerBounds(count1, count2 *Table4mer) (int, int) {
s1 := Sum4Mer(count1)
s2 := Sum4Mer(count2)
smax := obiutils.MaxInt(s1, s2)
smax := obiutils.Max(s1, s2)
cw := Common4Mer(count1, count2)

View File

@ -2,13 +2,16 @@ package obikmer
import (
"bytes"
"container/heap"
"fmt"
"math"
"math/bits"
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/daichi-m/go18ds/sets/linkedhashset"
"github.com/daichi-m/go18ds/stacks/arraystack"
log "github.com/sirupsen/logrus"
)
type KmerIdx32 uint32
@ -49,36 +52,60 @@ type KmerIdx_t interface {
}
type DeBruijnGraph struct {
kmersize int
kmermask uint64
prevc uint64
kmersize int // k-mer size
kmermask uint64 // mask used to set to 0 the bits that are not in the k-mer
prevc uint64 //
prevg uint64
prevt uint64
graph map[uint64]uint
graph map[uint64]uint // Kmer are encoded as uint64 with 2 bits per character
}
// MakeDeBruijnGraph creates a De Bruijn Graph with the specified k-mer size.
//
// Parameters:
//
// kmersize int - the size of the k-mers
//
// Returns:
//
// *DeBruijnGraph - a pointer to the created De Bruijn's Graph
func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph {
g := DeBruijnGraph{
kmersize: kmersize,
kmermask: ^(^uint64(0) << (uint64(kmersize+1) * 2)),
prevc: uint64(1) << (uint64(kmersize) * 2),
prevg: uint64(2) << (uint64(kmersize) * 2),
prevt: uint64(3) << (uint64(kmersize) * 2),
kmermask: ^(^uint64(0) << (uint64(kmersize) * 2)), // k-mer mask used to set to 0 the bits that are not in the k-mer
prevc: uint64(1) << (uint64(kmersize-1) * 2),
prevg: uint64(2) << (uint64(kmersize-1) * 2),
prevt: uint64(3) << (uint64(kmersize-1) * 2),
graph: make(map[uint64]uint),
}
return &g
}
// KmerSize returns the size of the k-mers in the DeBruijn graph.
//
// This function takes no parameters.
// It returns an integer representing the size of the k-mers.
func (g *DeBruijnGraph) KmerSize() int {
return g.kmersize
}
// Len returns the length of the graph.
//
// This function takes no parameters.
// It returns an integer representing the number of nodes in the graph.
func (g *DeBruijnGraph) Len() int {
return len(g.graph)
}
func (g *DeBruijnGraph) MaxLink() int {
// MaxWeight returns the maximum weight of a node from the DeBruijn's Graph.
//
// It iterates over each count in the graph map and updates the max value if the current count is greater.
// Finally, it returns the maximum weight as an integer.
//
// Returns:
// - int: the maximum weight value.
func (g *DeBruijnGraph) MaxWeight() int {
max := uint(0)
for _, count := range g.graph {
if count > max {
@ -89,8 +116,12 @@ func (g *DeBruijnGraph) MaxLink() int {
return int(max)
}
func (g *DeBruijnGraph) LinkSpectrum() []int {
max := g.MaxLink()
// WeightSpectrum calculates the weight spectrum of nodes in the DeBruijn's graph.
//
// No parameters.
// Returns an array of integers representing the weight spectrum.
func (g *DeBruijnGraph) WeightSpectrum() []int {
max := g.MaxWeight()
spectrum := make([]int, max+1)
for _, count := range g.graph {
spectrum[int(count)]++
@ -99,7 +130,10 @@ func (g *DeBruijnGraph) LinkSpectrum() []int {
return spectrum
}
func (g *DeBruijnGraph) FilterMin(min int) {
// FilterMinWeight filters the DeBruijnGraph by removing nodes with weight less than the specified minimum.
//
// min: an integer representing the minimum count threshold.
func (g *DeBruijnGraph) FilterMinWeight(min int) {
umin := uint(min)
for idx, count := range g.graph {
if count < umin {
@ -109,8 +143,12 @@ func (g *DeBruijnGraph) FilterMin(min int) {
}
func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
if _, ok := g.graph[index]; !ok {
log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index)
}
rep := make([]uint64, 0, 4)
index = index >> 2
index >>= 2
if _, ok := g.graph[index]; ok {
rep = append(rep, index)
@ -135,6 +173,10 @@ func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
}
func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
if _, ok := g.graph[index]; !ok {
log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index)
}
rep := make([]uint64, 0, 4)
index = (index << 2) & g.kmermask
@ -160,11 +202,11 @@ func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
return rep
}
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, int, bool) {
ns := g.Nexts(index)
if len(ns) == 0 {
return uint64(0), false
return uint64(0), 0, false
}
max := uint(0)
@ -177,7 +219,34 @@ func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
}
}
return rep, true
return rep, int(max), true
}
func (g *DeBruijnGraph) Heads() []uint64 {
rep := make([]uint64, 0, 10)
for k := range g.graph {
if len(g.Previouses(k)) == 0 {
rep = append(rep, k)
}
}
return rep
}
func (g *DeBruijnGraph) MaxHead() (uint64, int, bool) {
rep := uint64(0)
max := uint(0)
found := false
for k, w := range g.graph {
if len(g.Previouses(k)) == 0 && w > max {
rep = k
max = w
found = true
}
}
return rep, int(max), found
}
func (g *DeBruijnGraph) MaxPath() []uint64 {
@ -185,17 +254,17 @@ func (g *DeBruijnGraph) MaxPath() []uint64 {
ok := false
idx := uint64(0)
idx, ok = g.MaxHead()
idx, _, ok = g.MaxHead()
for ok {
path = append(path, idx)
idx, ok = g.MaxNext(idx)
idx, _, ok = g.MaxNext(idx)
}
return path
}
func (g *DeBruijnGraph) LongestPath() []uint64 {
func (g *DeBruijnGraph) LongestPath(max_length int) []uint64 {
var path []uint64
wmax := uint(0)
ok := true
@ -209,7 +278,11 @@ func (g *DeBruijnGraph) LongestPath() []uint64 {
nw := g.graph[idx]
w += nw
lp = append(lp, idx)
idx, ok = g.MaxNext(idx)
idx, _, ok = g.MaxNext(idx)
if max_length > 0 && len(lp) > max_length {
ok = false
w = 0
}
}
if w > wmax {
@ -221,8 +294,9 @@ func (g *DeBruijnGraph) LongestPath() []uint64 {
return path
}
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) {
path := g.LongestPath()
func (g *DeBruijnGraph) LongestConsensus(id string, max_length int) (*obiseq.BioSequence, error) {
//path := g.LongestPath(max_length)
path := g.HaviestPath()
s := g.DecodePath(path)
if len(s) > 0 {
@ -238,37 +312,10 @@ func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error)
return nil, fmt.Errorf("cannot identify optimum path")
}
func (g *DeBruijnGraph) Heads() []uint64 {
rep := make([]uint64, 0, 10)
for k := range g.graph {
if len(g.Previouses(k)) == 0 {
rep = append(rep, k)
}
}
return rep
}
func (g *DeBruijnGraph) MaxHead() (uint64, bool) {
rep := uint64(0)
max := uint(0)
found := false
for k, w := range g.graph {
if len(g.Previouses(k)) == 0 && w > max {
rep = k
found = true
}
}
return rep, found
}
func (g *DeBruijnGraph) DecodeNode(index uint64) string {
rep := make([]byte, g.kmersize)
index >>= 2
for i := g.kmersize - 1; i >= 0; i-- {
rep[i], _ = decode[index&3]
rep[i] = decode[index&3]
index >>= 2
}
@ -282,7 +329,7 @@ func (g *DeBruijnGraph) DecodePath(path []uint64) string {
if len(path) > 0 {
buf.WriteString(g.DecodeNode(path[0]))
for _, idx := range path {
for _, idx := range path[1:] {
buf.WriteByte(decode[idx&3])
}
}
@ -307,6 +354,13 @@ func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence, error) {
return nil, fmt.Errorf("cannot identify optimum path")
}
// Weight returns the weight of the node at the given index in the DeBruijnGraph.
//
// Parameters:
// - index: the index of the node in the graph.
//
// Returns:
// - int: the weight of the node.
func (g *DeBruijnGraph) Weight(index uint64) int {
val, ok := g.graph[index]
if !ok {
@ -315,59 +369,64 @@ func (g *DeBruijnGraph) Weight(index uint64) int {
return int(val)
}
// append appends a sequence of nucleotides to the DeBruijnGraph.
//
// Parameters:
// - sequence: a byte slice representing the sequence of nucleotides to append.
// - current: the current node in the graph to which the sequence will be appended.
// - weight: the weight of the added nodes.
func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int) {
for i := 0; i < len(sequence); i++ {
current <<= 2
current &= graph.kmermask
b := iupac[sequence[i]]
if len(b) == 1 {
current |= b[0]
graph.graph[current] = uint(graph.Weight(current) + weight)
} else {
for j := 0; j < len(b); j++ {
current &= ^uint64(3)
current |= b[j]
graph.graph[current] = uint(graph.Weight(current) + weight)
graph.append(sequence[(i+1):], current, weight)
}
if len(sequence) == 0 {
return
}
current <<= 2
current &= graph.kmermask
b := iupac[sequence[0]]
current |= b[0]
graph.graph[current] = uint(graph.Weight(current) + weight)
graph.append(sequence[1:], current, weight)
for j := 1; j < len(b); j++ {
current &= ^uint64(3)
current |= b[j]
graph.graph[current] = uint(graph.Weight(current) + weight)
graph.append(sequence[1:], current, weight)
}
}
func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
key := uint64(0)
s := sequence.Sequence()
w := sequence.Count()
init := make([]uint64, 0, 16)
var f func(start int, key uint64)
f = func(start int, key uint64) {
for i := start; i < graph.kmersize; i++ {
s := sequence.Sequence() // Get the sequence as a byte slice
w := sequence.Count() // Get the weight of the sequence
var initFirstKmer func(start int, key uint64)
// Initialize the first k-mer
// start is the index of the nucleotide in the k-mer to add
// key is the value of the k-mer index before adding the start nucleotide
initFirstKmer = func(start int, key uint64) {
if start == 0 {
key = 0
}
if start < graph.kmersize {
key <<= 2
b := iupac[s[i]]
if len(b) == 1 {
key |= b[0]
} else {
for j := 0; j < len(b); j++ {
b := iupac[s[start]]
for _, code := range b {
key &= ^uint64(3)
key |= b[j]
f(i+1, key)
key |= code
initFirstKmer(start+1, key)
}
return
} else {
graph.graph[key] = uint(graph.Weight(key) + w)
graph.append(s[graph.kmersize:], key, w)
}
}
init = append(init, key&graph.kmermask)
}
if sequence.Len() > graph.kmersize {
f(0, key)
for _, idx := range init {
graph.append(s[graph.kmersize:], idx, w)
}
initFirstKmer(0, 0)
}
}
@ -381,32 +440,38 @@ func (graph *DeBruijnGraph) Gml() string {
`)
nodeidx := make(map[uint64]int)
nodeid := 0
for idx := range graph.graph {
nodeid++
nodeidx[idx] = nodeid
n := graph.Nexts(idx)
p := graph.Previouses(idx)
if len(n) == 0 || len(p) == 0 {
node := graph.DecodeNode(idx)
buffer.WriteString(
fmt.Sprintf("node [ id \"%s\" ]\n", node),
fmt.Sprintf("node [ id \"%d\" \n label \"%s\" ]\n", nodeid, node),
)
n := graph.Nexts(uint64(idx))
if len(n) == 0 {
idx <<= 2
idx &= graph.kmermask
node := graph.DecodeNode(idx)
} else {
buffer.WriteString(
fmt.Sprintf("node [ id \"%s\" \n label \"%s\" ]\n", node, node),
fmt.Sprintf("node [ id \"%d\" ]\n", nodeid),
)
}
}
for idx, weight := range graph.graph {
src := graph.DecodeNode(idx)
label := decode[idx&3]
idx <<= 2
idx &= graph.kmermask
dst := graph.DecodeNode(idx)
for idx := range graph.graph {
srcid := nodeidx[idx]
n := graph.Nexts(idx)
for _, dst := range n {
dstid := nodeidx[dst]
label := decode[dst&3]
weight := graph.Weight(dst)
buffer.WriteString(
fmt.Sprintf(`edge [ source "%s"
target "%s"
fmt.Sprintf(`edge [ source "%d"
target "%d"
color "#00FF00"
label "%c[%d]"
graphics [
@ -416,9 +481,11 @@ func (graph *DeBruijnGraph) Gml() string {
]
]
`, src, dst, label, weight, math.Log(float64(weight))),
`, srcid, dstid, label, weight, math.Sqrt(float64(weight))),
)
}
}
buffer.WriteString("]\n")
return buffer.String()
@ -472,3 +539,151 @@ func (g *DeBruijnGraph) HammingDistance(kmer1, kmer2 uint64) int {
ident &= 0x5555555555555555 & g.kmermask
return bits.OnesCount64(ident)
}
// UInt64Heap is a min-heap of uint64 values implementing heap.Interface,
// so it can be driven by the container/heap package.
type UInt64Heap []uint64

// Len returns the number of values currently held by the heap.
func (h UInt64Heap) Len() int { return len(h) }

// Less reports whether the value at index i sorts before the value at index j.
func (h UInt64Heap) Less(i, j int) bool { return h[i] < h[j] }

// Swap exchanges the values stored at indices i and j.
func (h UInt64Heap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }

// Push appends x to the heap's backing slice. A pointer receiver is
// required because the slice's length changes, not just its contents.
func (h *UInt64Heap) Push(x any) {
	*h = append(*h, x.(uint64))
}

// Pop removes and returns the last element of the backing slice (the
// container/heap package has already swapped the minimum there).
// Pointer receiver for the same reason as Push.
func (h *UInt64Heap) Pop() any {
	last := len(*h) - 1
	x := (*h)[last]
	*h = (*h)[:last]
	return x
}
// HaviestPath returns the sequence of k-mer nodes forming the heaviest
// path (maximum cumulative node weight) through the graph, or nil when
// the graph contains a cycle.
//
// The search is a Dijkstra-like relaxation started from every head node
// (seeded by g.Heads()), maximising summed weights instead of minimising
// distances. The graph is verified acyclic first, so the relaxation
// terminates.
//
// NOTE(review): the value 0 is used both as the "no predecessor" marker
// in prevNodes and as a potentially valid k-mer code; the log.Warn calls
// below track occurrences of node 0 — confirm 0 can never be a real node.
func (g *DeBruijnGraph) HaviestPath() []uint64 {
	if g.HasCycle() {
		return nil
	}
	// Initialize the distance array and visited set
	distances := make(map[uint64]int)
	visited := make(map[uint64]bool)
	prevNodes := make(map[uint64]uint64)
	heaviestNode := uint64(0)
	heaviestWeight := 0
	queue := &UInt64Heap{}
	heap.Init(queue)
	startNodes := make(map[uint64]struct{})
	// Seed the queue with every head node, using its own weight as the
	// initial cumulative weight.
	for _, n := range g.Heads() {
		startNodes[n] = struct{}{}
		heap.Push(queue, n)
		distances[n] = g.Weight(n)
		prevNodes[n] = 0
		visited[n] = false
	}
	// Priority queue to keep track of nodes to visit.
	// NOTE(review): UInt64Heap orders by k-mer value, not by cumulative
	// weight; correctness relies on nodes being re-pushed each time their
	// distance improves (see the relaxation below).
	for len(*queue) > 0 {
		// Get the node with the smallest distance
		currentNode := heap.Pop(queue).(uint64)
		// If the current node has already been visited, skip it
		if visited[currentNode] {
			continue
		}
		// Mark the node as visited
		visited[currentNode] = true
		weight := distances[currentNode]
		// Update the heaviest node
		if weight > heaviestWeight {
			heaviestWeight = weight
			heaviestNode = currentNode
		}
		if currentNode == 0 {
			log.Warn("current node is 0")
		}
		// Update the distance of the neighbors
		nextNodes := g.Nexts(currentNode)
		for _, nextNode := range nextNodes {
			if nextNode == 0 {
				log.Warn("next node is 0")
			}
			weight := g.Weight(nextNode) + distances[currentNode]
			if distances[nextNode] < weight {
				distances[nextNode] = weight
				prevNodes[nextNode] = currentNode
				visited[nextNode] = false
				heap.Push(queue, nextNode)
				// Keep track of the node with the heaviest weight
				if weight > heaviestWeight {
					heaviestWeight = weight
					heaviestNode = nextNode
				}
			}
		}
	}
	log.Infof("Heaviest node: %d [%v]", heaviestNode, heaviestWeight)
	// Reconstruct the path from the start node to the heaviest node found
	// by following the predecessor links backwards; stop when a start node
	// is reached or a node repeats (which would indicate a cycle).
	heaviestPath := make([]uint64, 0)
	currentNode := heaviestNode
	for _, ok := startNodes[currentNode]; !ok && !slices.Contains(heaviestPath, currentNode); _, ok = startNodes[currentNode] {
		heaviestPath = append(heaviestPath, currentNode)
		//log.Infof("Current node: %d <- %d", currentNode, prevNodes[currentNode])
		currentNode = prevNodes[currentNode]
	}
	// Defensive check: should be unreachable since HasCycle() returned false.
	if slices.Contains(heaviestPath, currentNode) {
		log.Fatalf("Cycle detected %v -> %v (%v) len(%v)", heaviestPath, currentNode, startNodes, len(heaviestPath))
		return nil
	}
	heaviestPath = append(heaviestPath, currentNode)
	// Reverse the path so it runs from the start node to the heaviest node.
	slices.Reverse(heaviestPath)
	return heaviestPath
}
// HasCycle reports whether the graph contains at least one directed cycle.
//
// It performs a depth-first search from every not-yet-seen node while
// tracking the nodes of the current recursion stack: reaching a node that
// is still on that stack means a back edge exists, i.e. a cycle.
func (g *DeBruijnGraph) HasCycle() bool {
	seen := make(map[uint64]bool)
	onStack := make(map[uint64]bool)

	// visit explores the subgraph below node and reports whether a
	// back edge (cycle) is reachable from it.
	var visit func(node uint64) bool
	visit = func(node uint64) bool {
		seen[node] = true
		onStack[node] = true

		for _, succ := range g.Nexts(node) {
			if seen[succ] {
				if onStack[succ] {
					return true
				}
				continue
			}
			if visit(succ) {
				return true
			}
		}

		onStack[node] = false
		return false
	}

	// Run the DFS from every node not reached by a previous traversal.
	for node := range g.graph {
		if !seen[node] && visit(node) {
			return true
		}
	}

	return false
}

View File

@ -130,7 +130,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
sseq := sequence.String()
direct := sseq[start:end]
tagstart := obiutils.MaxInt(start-marker.taglength, 0)
tagstart := obiutils.Max(start-marker.taglength, 0)
ftag := strings.ToLower(sseq[tagstart:start])
m := DemultiplexMatch{
@ -150,7 +150,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
reverse, _ := sequence.Subsequence(start, end, false)
defer reverse.Recycle()
reverse = reverse.ReverseComplement(true)
endtag := obiutils.MinInt(end+marker.taglength, sequence.Len())
endtag := obiutils.Min(end+marker.taglength, sequence.Len())
rtag, err := sequence.Subsequence(end, endtag, false)
defer rtag.Recycle()
srtag := ""
@ -201,7 +201,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
sseq := sequence.String()
reverse := strings.ToLower(sseq[start:end])
tagstart := obiutils.MaxInt(start-marker.taglength, 0)
tagstart := obiutils.Max(start-marker.taglength, 0)
rtag := strings.ToLower(sseq[tagstart:start])
m := DemultiplexMatch{
@ -221,7 +221,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch {
defer direct.Recycle()
direct = direct.ReverseComplement(true)
endtag := obiutils.MinInt(end+marker.taglength, sequence.Len())
endtag := obiutils.Min(end+marker.taglength, sequence.Len())
ftag, err := sequence.Subsequence(end, endtag, false)
defer ftag.Recycle()
sftag := ""

View File

@ -38,13 +38,7 @@ import (
// return float64(m)
// }
// func minIntVector(values []int) float64 {
// m := values[0]
// for _, v := range values {
// if v < m {
// m = v
// }
// }
// return float64(m)
// }

View File

@ -27,7 +27,7 @@ func SuffixLess(suffixarray SuffixArray) func(i, j int) bool {
sj := suffixarray.Suffixes[j]
bj := (*suffixarray.Sequences)[int(sj.Idx)].Sequence()[sj.Pos:]
l := obiutils.MinInt(len(bi), len(bj))
l := obiutils.Min(len(bi), len(bj))
p := 0
for p < l && bi[p] == bj[p] {
p++
@ -92,7 +92,7 @@ func (suffixarray *SuffixArray) CommonSuffix() []int {
si := suffixarray.Suffixes[i]
bi := (*suffixarray.Sequences)[int(si.Idx)].Sequence()[si.Pos:]
l := obiutils.MinInt(len(bi), len(bp))
l := obiutils.Min(len(bi), len(bp))
p := 0
for p < l && bi[p] == bp[p] {
p++

View File

@ -4,92 +4,93 @@ import (
"fmt"
"os"
"path"
"sort"
"slices"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
func BuildConsensus(seqs obiseq.BioSequenceSlice,
consensus_id string,
kmer_size int, quorum float64,
min_depth float64,
max_length int,
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
if save_graph {
if dirname == "" {
dirname = "."
}
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
// path does not exist or is not directory
os.RemoveAll(dirname)
err := os.Mkdir(dirname, 0755)
if err != nil {
log.Panicf("Cannot create directory %s for saving graphs", dirname)
}
}
fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s.fasta", consensus_id)))
if err == nil {
defer fasta.Close()
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(0, seqs), obiformats.FormatFastSeqJsonHeader, false))
fasta.Close()
}
}
log.Printf("Number of reads : %d\n", len(seqs))
if kmer_size < 0 {
longest := make([]int, len(seqs))
for i := range seqs {
s := seqs[i : i+1]
for i, seq := range seqs {
s := obiseq.BioSequenceSlice{seq}
sa := obisuffix.BuildSuffixArray(&s)
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
longest[i] = slices.Max(sa.CommonSuffix())
}
o := obiutils.Order(sort.IntSlice(longest))
i := int(float64(len(seqs)) * quorum)
// o := obiutils.Order(sort.IntSlice(longest))
// i := int(float64(len(seqs)) * quorum)
kmer_size = longest[o[i]] + 1
// if i >= len(o) {
// i = len(o) - 1
// }
kmer_size = slices.Max(longest) + 1
// kmer_size = longest[o[i]] + 1
log.Printf("estimated kmer size : %d", kmer_size)
}
graph := obikmer.MakeDeBruijnGraph(kmer_size)
var graph *obikmer.DeBruijnGraph
for {
graph = obikmer.MakeDeBruijnGraph(kmer_size)
for _, s := range seqs {
graph.Push(s)
}
log.Printf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
threshold := 0
switch {
case min_depth < 0:
spectrum := graph.LinkSpectrum()
cum := make(map[int]int)
spectrum[1] = 0
for i := 2; i < len(spectrum); i++ {
spectrum[i] += spectrum[i-1]
cum[spectrum[i]]++
}
max := 0
kmax := 0
for k, obs := range cum {
if obs > max {
max = obs
kmax = k
}
}
for i, total := range spectrum {
if total == kmax {
threshold = i
if !graph.HasCycle() {
break
}
}
threshold /= 2
case min_depth >= 1:
threshold = int(min_depth)
default:
threshold = int(float64(len(seqs)) * min_depth)
}
graph.FilterMin(threshold)
log.Printf("Graph size : %d\n", graph.Len())
kmer_size++
log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size)
}
if save_graph {
file, err := os.Create(path.Join(dirname,
fmt.Sprintf("%s.gml", seqs[0].Source())))
fmt.Sprintf("%s_raw_consensus.gml", consensus_id)))
if err != nil {
fmt.Println(err)
@ -99,14 +100,72 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
}
}
id := seqs[0].Source()
if id == "" {
id = seqs[0].Id()
}
seq, err := graph.LongestConsensus(id)
log.Printf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
// threshold := 0
// switch {
// case min_depth < 0:
// spectrum := graph.WeightSpectrum()
// cum := make(map[int]int)
// spectrum[1] = 0
// for i := 2; i < len(spectrum); i++ {
// spectrum[i] += spectrum[i-1]
// cum[spectrum[i]]++
// }
// max := 0
// kmax := 0
// for k, obs := range cum {
// if obs > max {
// max = obs
// kmax = k
// }
// }
// for i, total := range spectrum {
// if total == kmax {
// threshold = i
// break
// }
// }
// threshold /= 2
// if threshold < 1 {
// threshold = 1
// }
// log.Info("Estimated kmer_min_occur = ", threshold)
// case min_depth >= 1:
// threshold = int(min_depth)
// default:
// threshold = int(float64(len(seqs)) * min_depth)
// }
// graph.FilterMinWeight(threshold)
// log.Printf("Graph size : %d\n", graph.Len())
// if save_graph {
// file, err := os.Create(path.Join(dirname,
// fmt.Sprintf("%s_consensus.gml", consensus_id)))
// if err != nil {
// fmt.Println(err)
// } else {
// file.WriteString(graph.Gml())
// file.Close()
// }
// }
seq, err := graph.LongestConsensus(consensus_id, max_length)
sumCount := 0
if seq != nil {
for _, s := range seqs {
sumCount += s.Count()
}
@ -114,14 +173,60 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
seq.SetCount(sumCount)
seq.SetAttribute("seq_length", seq.Len())
seq.SetAttribute("kmer_size", kmer_size)
seq.SetAttribute("kmer_min_occur", threshold)
seq.SetAttribute("kmer_max_occur", graph.MaxLink())
//seq.SetAttribute("kmer_min_occur", threshold)
seq.SetAttribute("kmer_max_occur", graph.MaxWeight())
seq.SetAttribute("filtered_graph_size", graph.Len())
seq.SetAttribute("full_graph_size", total_kmer)
}
return seq, err
}
// func BuildConsensusWithTimeout(seqs obiseq.BioSequenceSlice,
// kmer_size int, quorum float64,
// min_depth float64,
// save_graph bool, dirname string, timeout time.Duration) (*obiseq.BioSequence, error) {
// ctx, cancel := context.WithTimeout(context.Background(), timeout)
// defer cancel()
// consensus := func() *obiseq.BioSequence {
// cons, err := BuildConsensus(seqs, kmer_size, quorum, min_depth, save_graph, dirname,)
// if err != nil {
// cons = nil
// }
// return cons
// }
// computation := func() <-chan *obiseq.BioSequence {
// result := make(chan *obiseq.BioSequence)
// go func() {
// select {
// case <-ctx.Done():
// result <- nil
// default:
// result <- consensus()
// }
// }()
// return result
// }
// calcResult := computation()
// select {
// case result := <-calcResult:
// if result == nil {
// return nil, fmt.Errorf("cannot compute consensus")
// }
// return result, nil
// case <-ctx.Done():
// return nil, fmt.Errorf("compute consensus timeout, exiting")
// }
// }
func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
newIter := obiiter.MakeIBioSequence()
size := 10
@ -153,10 +258,19 @@ func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for iterator.Next() {
seqs := iterator.Get()
consensus, err := BuildConsensus(seqs.Slice(),
sequences := seqs.Slice()
id := sequences[0].Source()
if id == "" {
id = sequences[0].Id()
}
consensus, err := BuildConsensus(sequences,
id,
CLIKmerSize(), CLIThreshold(),
CLIKmerDepth(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory(),
CLIMaxConsensusLength(),
CLISaveGraphToFiles(),
CLIGraphFilesDirectory(),
)
if err == nil {

View File

@ -9,6 +9,7 @@ var _saveGraph = "__@@NOSAVE@@__"
var _kmerSize = -1
var _threshold = 0.99
var _mindepth = -1.0
var _consensus_max_length = -1
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
@ -38,6 +39,12 @@ func ObiconsensusOptionSet(options *getoptions.GetOpt) {
"Default value = -1, which means that the DEPTH is estimated from the data"),
)
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
options.ArgName("LENGTH"),
options.Description("Maximum length of the consensus sequence. "+
"Default value = -1, which means that no limit is applied"),
)
}
func OptionSet(options *getoptions.GetOpt) {
@ -67,3 +74,7 @@ func CLIKmerDepth() float64 {
func CLIThreshold() float64 {
return _threshold
}
func CLIMaxConsensusLength() int {
return _consensus_max_length
}

View File

@ -0,0 +1,290 @@
package obiminion
import (
"fmt"
"os"
"sync"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obigraph"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiannotate"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiuniq"
"github.com/schollz/progressbar/v3"
log "github.com/sirupsen/logrus"
)
// SampleWeight calculates the weight of a sample based on the statistics of a sequence.
//
// Parameters:
//   - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice)
//   - sample: the sample for which the weight is calculated (string)
//   - sample_key: the key used to access the sample's statistics (string)
//
// Return type: a function that takes an integer index and returns the weight of
// the sample at that index as a float64 (func(int) float64). The returned
// function yields 0 when the sequence has no statistics entry for the sample.
func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func(int) float64 {
	f := func(i int) float64 {
		// Look up the per-sample statistics of the i-th sequence;
		// "NA" is the fallback bucket used by StatsOn.
		stats := (*seqs)[i].StatsOn(sample_key, "NA")
		if value, ok := stats[sample]; ok {
			return float64(value)
		}
		return 0
	}
	return f
}
// SeqBySamples groups the sequences by the samples they occur in.
//
// A sequence carrying merged statistics for sample_key is added to the
// bucket of every sample present in those statistics; otherwise the plain
// string attribute sample_key decides the single bucket. Sequences with
// neither are silently skipped.
//
// Parameters:
//   - seqs: a BioSequenceSlice representing the sequences
//   - sample_key: the attribute name identifying the sample (string)
//
// Return type:
//   - map[string]*BioSequenceSlice: a map indexed by sample names, each
//     entry pointing to the slice of sequences observed in that sample.
func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*obiseq.BioSequenceSlice {
	samples := make(map[string]*obiseq.BioSequenceSlice)

	// add registers sequence s under sample name k, creating the
	// bucket on first use. The map stores pointers, so appending
	// through the pointer needs no re-assignment into the map.
	add := func(k string, s *obiseq.BioSequence) {
		if seqset, ok := samples[k]; ok {
			*seqset = append(*seqset, s)
		} else {
			samples[k] = &obiseq.BioSequenceSlice{s}
		}
	}

	for _, s := range seqs {
		if s.HasStatsOn(sample_key) {
			// Merged sequence: one entry per sample listed in its stats.
			stats := s.StatsOn(sample_key, "NA")
			for k := range stats {
				add(k, s)
			}
		} else if k, ok := s.GetStringAttribute(sample_key); ok {
			// Plain sequence: a single sample given as a string attribute.
			add(k, s)
		}
	}

	return samples
}
// Mutation describes a single difference observed between two sequence
// variants linked by an edge of the difference graph.
type Mutation struct {
	Position int // position of the difference, as reported by the aligner
	SeqA byte // nucleotide observed in the first sequence of the pair
	SeqB byte // nucleotide observed in the second sequence of the pair
	Ratio float64 // abundance ratio of the two sequences (weight of A / weight of B) — TODO confirm orientation
}
// BuildDiffSeqGraph builds a graph connecting sequences differing by at
// most distmax differences.
//
// Vertices are the sequences of seqs; vertex weights come from
// SampleWeight(seqs, name, name_key). An edge carries a Mutation value.
// Edge computation over all i<j pairs is distributed on nworkers
// goroutines fed through the iseq channel; a progress bar is shown when
// enabled on the command line.
//
// Parameters:
//   - name: the sample name, also used to label the progress bar
//   - name_key: the attribute key giving access to per-sample statistics
//   - seqs: the sequences to connect
//   - distmax: the maximum number of differences joining two sequences
//   - nworkers: the number of concurrent edge-computing goroutines
//
// Returns the built graph.
func BuildDiffSeqGraph(name, name_key string,
	seqs *obiseq.BioSequenceSlice,
	distmax, nworkers int) *obigraph.Graph[*obiseq.BioSequence, Mutation] {
	graph := obigraph.NewGraphBuffer[*obiseq.BioSequence, Mutation](name, (*[]*obiseq.BioSequence)(seqs))
	iseq := make(chan int)
	defer graph.Close()
	ls := len(*seqs)
	sw := SampleWeight(seqs, name, name_key)
	graph.Graph.VertexWeight = sw
	waiting := sync.WaitGroup{}
	waiting.Add(nworkers)
	bar := (*progressbar.ProgressBar)(nil)
	if obiconvert.CLIProgressBar() {
		pbopt := make([]progressbar.Option, 0, 5)
		pbopt = append(pbopt,
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription(fmt.Sprintf("[Build graph] on %s", name)),
		)
		bar = progressbar.NewOptions(len(*seqs), pbopt...)
	}
	// computeEdges consumes sequence indices from iseq and compares each
	// against all following sequences, adding an edge when they differ by
	// at most distmax positions.
	computeEdges := func() {
		defer waiting.Done()
		for i := range iseq {
			s1 := (*seqs)[i]
			for j := i + 1; j < ls; j++ {
				s2 := (*seqs)[j]
				// NOTE(review): sw(j) may be 0 for a sequence absent from
				// the sample's statistics, making ratio +Inf — confirm
				// callers guarantee positive weights.
				ratio := sw(i) / sw(j)
				ok, pos, a1, a2 := obialign.D1Or0(s1, s2)
				if ok >= 0 {
					// The two sequences differ by at most one position.
					graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio})
				} else if distmax > 1 {
					// Fall back on an LCS alignment for larger distances.
					// NOTE(review): pos/a1/a2 come from the failed D1Or0
					// call above — confirm they are meaningful here.
					lcs, lali := obialign.FastLCSScore(s1, s2, distmax, nil)
					dist := lali - lcs
					if lcs > 0 && dist <= distmax {
						// log.Infof("Seq %s and %s: LCSScore: %d, dist: %d\n", s1.Id(), s2.Id(), lcs, dist)
						graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio})
					}
				}
			}
			if bar != nil {
				bar.Add(1)
			}
		}
	}
	for i := 0; i < nworkers; i++ {
		go computeEdges()
	}
	// Feed every sequence index to the workers, then wait for completion.
	for i := 0; i < ls; i++ {
		iseq <- i
	}
	close(iseq)
	waiting.Wait()
	return graph.Graph
}
// MinionDenoise produces a denoised sequence for every vertex of the
// difference graph.
//
// A vertex with more than 4 neighbours is replaced by the consensus of
// itself and its neighbours (obiconsensus.BuildConsensus); on consensus
// failure, or for low-degree vertices, the original sequence is kept.
// Each returned sequence gets the attribute "obiminion_consensus"
// (true/false), its count set to the vertex weight, and sample_key set to
// the graph (sample) name.
//
// Parameters:
//   - graph: the difference graph built for one sample
//   - sample_key: the attribute name identifying the sample
//   - kmer_size, max_length, threshold, depth: forwarded to BuildConsensus
//
// Returns the slice of denoised sequences, one per vertex, in vertex order.
func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
	sample_key string, kmer_size int, max_length int, threshold float64, depth float64) obiseq.BioSequenceSlice {
	denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices))
	for i, v := range *graph.Vertices {
		var err error
		var clean *obiseq.BioSequence
		degree := graph.Degree(i)
		if degree > 4 {
			// Gather the vertex and all its neighbours as consensus input.
			pack := obiseq.MakeBioSequenceSlice(degree + 1)
			for k,j := range graph.Neighbors(i) {
				pack[k] = (*graph.Vertices)[j]
			}
			pack[degree] = v
			clean, err = obiconsensus.BuildConsensus(pack,
				fmt.Sprintf("%s_consensus", v.Id()),
				kmer_size,
				threshold,
				depth, max_length,
				CLISaveGraphToFiles(), CLIGraphFilesDirectory())
			if err != nil {
				// Consensus failed: keep the original sequence.
				// NOTE(review): this aliases the shared vertex object and
				// mutates it below (SetAttribute/SetCount), unlike the
				// low-degree branch which builds a copy — confirm intended.
				log.Warning(err)
				clean = (*graph.Vertices)[i]
				clean.SetAttribute("obiminion_consensus", false)
			} else {
				clean.SetAttribute("obiminion_consensus", true)
			}
			pack.Recycle(false)
		} else {
			// Low-degree vertex: keep a copy of the original sequence.
			clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition())
			clean.SetAttribute("obiminion_consensus", false)
		}
		clean.SetCount(int(graph.VertexWeight(i)))
		clean.SetAttribute(sample_key, graph.Name)
		denoised[i] = clean
	}
	return denoised
}
// CLIOBIMinion denoises the input sequence dataset sample by sample.
//
// The whole dataset is loaded in memory and partitioned by sample
// (SeqBySamples). For each sample a pairwise-difference graph is built
// (BuildDiffSeqGraph), its sequences are denoised (MinionDenoise), and
// the per-sample results are finally dereplicated with obiuniq before a
// sequence-length annotation is added.
//
// iterator: the iterator providing the input sequences.
// Returns an iterator over the denoised, dereplicated sequences.
func CLIOBIMinion(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	dirname := CLIGraphFilesDirectory()
	newIter := obiiter.MakeIBioSequence()

	db := iterator.Load()

	log.Infof("Sequence dataset of %d sequences loaded\n", len(db))

	samples := SeqBySamples(db, CLISampleAttribute())

	db.Recycle(false)

	log.Infof("Dataset composed of %d samples\n", len(samples))

	if CLIMaxConsensusLength() > 0 {
		log.Infof("Maximum consensus length: %d\n", CLIMaxConsensusLength())
	}

	if CLISaveGraphToFiles() {
		if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
			// path does not exist or is not a directory: recreate it
			os.RemoveAll(dirname)
			err := os.Mkdir(dirname, 0755)

			if err != nil {
				log.Panicf("Cannot create directory %s for saving graphs", dirname)
			}
		}
	}

	bar := (*progressbar.ProgressBar)(nil)

	if obiconvert.CLIProgressBar() {
		pbopt := make([]progressbar.Option, 0, 5)
		pbopt = append(pbopt,
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription("[Filter graph on abundance ratio]"),
		)

		bar = progressbar.NewOptions(len(samples), pbopt...)
	}

	newIter.Add(1)

	// Process each sample in its own batch, pushing results downstream.
	go func() {
		sample_order := 0
		for sample, seqs := range samples {
			graph := BuildDiffSeqGraph(sample,
				CLISampleAttribute(),
				seqs,
				CLIDistStepMax(),
				obioptions.CLIParallelWorkers())

			if bar != nil {
				bar.Add(1)
			}

			if CLISaveGraphToFiles() {
				graph.WriteGmlFile(fmt.Sprintf("%s/%s.gml",
					CLIGraphFilesDirectory(),
					sample),
					false, 1, 0, 3)
			}

			denoised := MinionDenoise(graph,
				CLISampleAttribute(),
				CLIKmerSize(),
				CLIMaxConsensusLength(),
				CLIThreshold(),
				CLIKmerDepth())

			newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised))
			sample_order++
		}

		newIter.Done()
	}()

	go func() {
		newIter.WaitAndClose()
	}()

	// Dereplicate the denoised sequences, merging per-sample statistics.
	obiuniq.AddStatsOn(CLISampleAttribute())
	obiuniq.SetUniqueInMemory(false)
	obiuniq.SetNoSingleton(CLINoSingleton())
	return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
}

View File

@ -0,0 +1,179 @@
package obiminion
import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"

	"github.com/DavidGamba/go-getoptions"
)
var _distStepMax = 1 // -d/--distance: maximum differences between two variant sequences
var _sampleAttribute = "sample" // -s/--sample: attribute holding the sample name
var _ratioMax = 1.0 // upper bound on abundance ratio — appears unused in this option set, TODO confirm
var _minEvalRate = 1000 // --min-eval-rate: minimum abundance to evaluate mutation rate
var _clusterMode = false // not settable from this option set — TODO confirm still needed
var _onlyHead = false // not settable from this option set — TODO confirm still needed
var _kmerSize = -1 // --kmer-size: -1 means the kmer size is estimated from the data
var _threshold = 1.0 // --threshold: ratio used to determine the optimal kmer size
var _mindepth = -1.0 // --min-depth: -1 means the depth is estimated from the data
var _consensus_max_length = 1000 // --consensus-max-length: maximum consensus length (-1 disables the limit)
var _NoSingleton = false // --no-singleton: discard sequences occurring a single time
var _saveGraph = "__@@NOSAVE@@__" // --save-graph: output directory; sentinel value means "do not save"
var _saveRatio = "__@@NOSAVE@@__" // --save-ratio: output file; sentinel value means "do not save"
// ObiminionOptionSet declares the command-line options specific to obiminion.
//
// options: the option parser the flags are registered on.
func ObiminionOptionSet(options *getoptions.GetOpt) {
	// Default values are injected into the help text with fmt.Sprintf:
	// go-getoptions does not format Description strings itself.
	options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
		options.Alias("s"),
		options.Description(fmt.Sprintf("Attribute containing sample descriptions (default %s).", _sampleAttribute)))

	options.IntVar(&_distStepMax, "distance", _distStepMax,
		options.Alias("d"),
		options.Description(fmt.Sprintf("Maximum numbers of differences between two variant sequences (default: %d).", _distStepMax)))

	options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate,
		options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate."))

	options.StringVar(&_saveGraph, "save-graph", _saveGraph,
		options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
			"The graph files follow the graphml format."),
	)

	options.StringVar(&_saveRatio, "save-ratio", _saveRatio,
		options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+
			"The ratio file follows the csv format."),
	)

	options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
		options.ArgName("SIZE"),
		options.Description("The size of the kmer used to build the consensus. "+
			"Default value = -1, which means that the kmer size is estimated from the data"),
	)

	options.Float64Var(&_threshold, "threshold", _threshold,
		options.ArgName("RATIO"),
		options.Description("A threshold between 0 and 1 used to determine the optimal "+
			"kmer size"),
	)

	options.Float64Var(&_mindepth, "min-depth", _mindepth,
		options.ArgName("DEPTH"),
		options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+
			"reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+
			"than 1, indicate the minimum count of occurrence for a kmer to be kept. "+
			"Default value = -1, which means that the DEPTH is estimated from the data"),
	)

	options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
		options.ArgName("LENGTH"),
		options.Description(fmt.Sprintf("Maximum length of the consensus sequence. "+
			"A value of -1 means that no limit is applied (default: %d).", _consensus_max_length)),
	)

	options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
		options.Description("If set, sequences occurring a single time in the data set are discarded."))
}
// OptionSet declares the full option set of the obiminion command:
// the shared input and output options of obiconvert, plus the
// obiminion-specific options.
//
// It takes a pointer to a getoptions.GetOpt object as a parameter.
// It does not return any value.
func OptionSet(options *getoptions.GetOpt) {
	obiconvert.InputOptionSet(options)
	obiconvert.OutputOptionSet(options)
	ObiminionOptionSet(options)
}
// CLIDistStepMax returns the maximum distance between two sequences.
//
// The value of the distance is set by the user with the `-d` flag.
//
// No parameters.
// Returns an integer.
func CLIDistStepMax() int {
	return _distStepMax
}
// CLISampleAttribute returns the name of the attribute used to store sample name.
//
// The value of the sample attribute is set by the user with the `-s` flag.
//
// No parameters.
// Returns a string.
func CLISampleAttribute() string {
	return _sampleAttribute
}
// CLIMinCountToEvalMutationRate returns the minimum number of reads that
// must be observed before the mutation rate can be evaluated.
func CLIMinCountToEvalMutationRate() int {
	return _minEvalRate
}
// ClusterMode reports whether the cluster mode is enabled.
// NOTE(review): no flag in this option set changes _clusterMode — confirm intended.
func ClusterMode() bool {
	return _clusterMode
}
// OnlyHead reports whether only head sequences must be kept.
// NOTE(review): no flag in this option set changes _onlyHead — confirm intended.
func OnlyHead() bool {
	return _onlyHead
}
// CLISaveGraphToFiles returns true if the obiclean graphs must be saved.
func CLISaveGraphToFiles() bool {
	return _saveGraph != "__@@NOSAVE@@__"
}
// CLIGraphFilesDirectory returns the directory where the graph files are saved.
func CLIGraphFilesDirectory() string {
	return _saveGraph
}
// IsSaveRatioTable returns true if the table of abundance ratios must be saved.
func IsSaveRatioTable() bool {
	return _saveRatio != "__@@NOSAVE@@__"
}
// RatioTableFilename returns the name of the file that stores the ratio table.
func RatioTableFilename() string {
	return _saveRatio
}
// CLIKmerSize returns the value of the kmer size to use for building the consensus.
//
// The value of the kmer size is set by the user with the `-k` flag.
// The value -1 means that the kmer size is estimated as the minimum value that
// insure that no kmer are present more than one time in a sequence.
//
// No parameters.
// Returns an integer value.
func CLIKmerSize() int {
	return _kmerSize
}
// CLIKmerDepth returns the minimum depth for a kmer to be kept in the
// consensus graph (--min-depth). -1 means estimated from the data.
func CLIKmerDepth() float64 {
	return _mindepth
}
// CLIThreshold returns the ratio used to determine the optimal kmer size
// (--threshold).
func CLIThreshold() float64 {
	return _threshold
}
// CLIMaxConsensusLength returns the maximum allowed length of a consensus
// sequence (--consensus-max-length).
func CLIMaxConsensusLength() int {
	return _consensus_max_length
}
// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded.
//
// No parameters.
// Returns a boolean value indicating whether or not singleton sequences should be discarded.
func CLINoSingleton() bool {
	return _NoSingleton
}

View File

@ -47,8 +47,8 @@ func CLIPCR(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) {
frags := obiiter.IFragments(
CLIMaxLength()*1000,
CLIMaxLength()*100,
CLIMaxLength()+obiutils.MaxInt(len(CLIForwardPrimer()),
len(CLIReversePrimer()))+obiutils.MinInt(len(CLIForwardPrimer()),
CLIMaxLength()+obiutils.Max(len(CLIForwardPrimer()),
len(CLIReversePrimer()))+obiutils.Min(len(CLIForwardPrimer()),
len(CLIReversePrimer()))/2,
100,
obioptions.CLIParallelWorkers(),

View File

@ -63,7 +63,7 @@ func IndexSequence(seqidx int,
if lca[order] == ancestor {
// nseq[i]++
if mini != -1 {
wordmin = obiutils.MaxInt(sequence.Len(), references[order].Len()) - 3 - 4*mini
wordmin = obiutils.Max(sequence.Len(), references[order].Len()) - 3 - 4*mini
}
if cw[order] < wordmin {
@ -189,7 +189,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
indexed := obiiter.MakeIBioSequence()
go func() {
for i := 0; i < len(references); i += 10 {
limits <- [2]int{i, obiutils.MinInt(i+10, len(references))}
limits <- [2]int{i, obiutils.Min(i+10, len(references))}
}
close(limits)
}()

View File

@ -110,7 +110,7 @@ func FindClosests(sequence *obiseq.BioSequence,
d, _, _, _ := obialign.D1Or0(sequence, references[order])
if d >= 0 {
score = d
alilength = obiutils.MaxInt(sequence.Len(), ref.Len())
alilength = obiutils.Max(sequence.Len(), ref.Len())
lcs = alilength - score
}
} else {

View File

@ -12,6 +12,11 @@ var _chunks = 100
var _NAValue = "NA"
var _NoSingleton = false
// UniqueOptionSet sets up unique options for the obiuniq command.
//
// It configures various options such as merging attributes, category attributes,
// defining the NA value, handling singleton sequences, choosing between in-memory
// or disk storage, and specifying the chunk count for dataset division.
func UniqueOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&_StatsOn, "merge",
1, 1,
@ -40,25 +45,67 @@ func UniqueOptionSet(options *getoptions.GetOpt) {
}
// OptionSet adds to the basic option set every options declared for
// the obiuniq command
//
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
UniqueOptionSet(options)
}
// CLIStatsOn returns the list of variables on witch statistics are computed.
//
// It does not take any parameters.
// It returns a slice of strings representing the statistics on values.
func CLIStatsOn() []string {
return _StatsOn
}
// SetStatsOn sets the list of variables on witch statistics are computed.
//
// It takes a slice of strings as its parameter and does not return anything.
func SetStatsOn(statsOn []string) {
_StatsOn = statsOn
}
// AddStatsOn adds a variable to the list of variables on witch statistics are computed.
//
// Parameters:
// - statsOn: variadic strings representing the statistics to be added.
func AddStatsOn(statsOn ...string) {
_StatsOn = append(_StatsOn, statsOn...)
}
// CLIKeys returns the keys used to distinguished among identical sequences.
//
// It does not take any parameters.
// It returns a slice of strings representing the keys used by the CLI.
func CLIKeys() []string {
return _Keys
}
// CLIUniqueInMemory returns if the unique function is running in memory only.
//
// It does not take any parameters.
// It returns a boolean value indicating whether the function is running in memory or not.
func CLIUniqueInMemory() bool {
return _InMemory
}
// SetUniqueInMemory sets whether the unique function is running in memory or not.
//
// inMemory bool - A boolean value indicating whether the function is running in memory.
// No return value.
func SetUniqueInMemory(inMemory bool) {
_InMemory = inMemory
}
// CLINumberOfChunks returns the number of chunks used for the first bucket sort step used by the unique function.
//
// It does not take any parameters.
// It returns an integer representing the number of chunks.
func CLINumberOfChunks() int {
if _chunks <= 1 {
return 1
@ -67,10 +114,40 @@ func CLINumberOfChunks() int {
return _chunks
}
// SetNumberOfChunks sets the number of chunks used for the first bucket sort step used by the unique function.
//
// chunks int - The number of chunks to be set.
// No return value.
func SetNumberOfChunks(chunks int) {
_chunks = chunks
}
// CLINAValue returns the value used as a placeholder for missing values.
//
// No parameters.
// Return type: string.
func CLINAValue() string {
return _NAValue
}
// SetNAValue sets the NA value to the specified string.
//
// value string - The value to set as the NA value.
func SetNAValue(value string) {
_NAValue = value
}
// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded.
//
// No parameters.
// Returns a boolean value indicating whether or not singleton sequences should be discarded.
func CLINoSingleton() bool {
return _NoSingleton
}
// SetNoSingleton sets the boolean value indicating whether or not singleton sequences should be discarded.
//
// noSingleton bool - The boolean value to set for _NoSingleton.
func SetNoSingleton(noSingleton bool) {
_NoSingleton = noSingleton
}

View File

@ -8,7 +8,7 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
)
func Unique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
func CLIUnique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
options := make([]obichunk.WithOption, 0, 30)

View File

@ -1,69 +1,32 @@
package obiutils
import "golang.org/x/exp/constraints"
import (
"golang.org/x/exp/constraints"
)
func MinInt(x, y int) int {
func Min[T constraints.Ordered](x, y T) T {
if x < y {
return x
}
return y
}
func MaxInt(x, y int) int {
func Max[T constraints.Ordered](x, y T) T {
if x < y {
return y
}
return x
}
func MinMaxInt(x, y int) (int, int) {
func MinMax[T constraints.Ordered](x, y T) (T, T) {
if x < y {
return x, y
}
return y, x
}
func MinUInt16(x, y uint16) uint16 {
if x < y {
return x
}
return y
}
func MaxUInt16(x, y uint16) uint16 {
if x < y {
return y
}
return x
}
func MinSlice[T constraints.Ordered](vec []T) T {
if len(vec) == 0 {
panic("empty slice")
}
min := vec[0]
for _, v := range vec {
if v < min {
min = v
}
}
return min
}
func MaxSlice[T constraints.Ordered](vec []T) T {
if len(vec) == 0 {
panic("empty slice")
}
max := vec[0]
for _, v := range vec {
if v > max {
max = v
}
}
return max
}
func RangeSlice[T constraints.Ordered](vec []T) (min, max T) {
func MinMaxSlice[T constraints.Ordered](vec []T) (min, max T) {
if len(vec) == 0 {
panic("empty slice")
}

View File

@ -13,8 +13,8 @@ import (
// Zero is a zero-valued uint128.
var Zero Uint128
// Max is the largest possible uint128 value.
var Max = New(math.MaxUint64, math.MaxUint64)
// MaxUint128 is the largest possible uint128 value.
var MaxUint128 = New(math.MaxUint64, math.MaxUint64)
// A Uint128 is an unsigned 128-bit number.
type Uint128 struct {