diff --git a/cmd/obitools/obiminion/main.go b/cmd/obitools/obiminion/main.go new file mode 100644 index 0000000..ad74cda --- /dev/null +++ b/cmd/obitools/obiminion/main.go @@ -0,0 +1,33 @@ +package main + +import ( + "os" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiminion" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" +) + +func main() { + optionParser := obioptions.GenerateOptionParser(obiminion.OptionSet) + + _, args := optionParser(os.Args) + + fs, err := obiconvert.CLIReadBioSequences(args...) + + if err != nil { + log.Errorf("Cannot open file (%v)", err) + os.Exit(1) + } + + cleaned := obiminion.CLIOBIMinion(fs) + + obiconvert.CLIWriteBioSequences(cleaned, true) + + obiiter.WaitForLastPipe() + +} diff --git a/cmd/obitools/obiuniq/main.go b/cmd/obitools/obiuniq/main.go index dbc0ba1..b3dcf1a 100644 --- a/cmd/obitools/obiuniq/main.go +++ b/cmd/obitools/obiuniq/main.go @@ -43,7 +43,7 @@ func main() { os.Exit(1) } - unique := obiuniq.Unique(sequences) + unique := obiuniq.CLIUnique(sequences) obiconvert.CLIWriteBioSequences(unique, true) obiiter.WaitForLastPipe() diff --git a/pkg/obialign/fastlcs.go b/pkg/obialign/fastlcs.go index 789ad87..075d4a0 100644 --- a/pkg/obialign/fastlcs.go +++ b/pkg/obialign/fastlcs.go @@ -58,182 +58,3 @@ var _empty = encodeValues(0, 0, false) var _out = encodeValues(0, 30000, true) var _notavail = encodeValues(0, 30000, false) -// func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { - -// lA := seqA.Len() -// lB := seqB.Len() - -// // Ensure that A is the longest -// if lA < lB { -// seqA, seqB = seqB, seqA -// lA, lB = lB, lA -// } - -// if maxError == -1 { -// maxError = lA * 2 -// } - -// delta := lA - lB - -// // The difference of length is larger the maximum 
allowed errors -// if delta > maxError { -// return -1, -1 -// } - -// // Doit-on vraiment diviser par deux ??? pas certain -// extra := (maxError - delta) + 1 - -// even := 1 + delta + 2*extra -// width := 2*even - 1 - -// if buffer == nil { -// var local []uint64 -// buffer = &local -// } - -// if cap(*buffer) < 2*width { -// *buffer = make([]uint64, 3*width) -// } - -// previous := (*buffer)[0:width] -// current := (*buffer)[width:(2 * width)] - -// previous[extra] = _empty -// previous[extra+even] = encodeValues(0, 1, false) -// previous[extra+even-1] = encodeValues(0, 1, false) - -// N := lB + ((delta) >> 1) - -// bA := seqA.Sequence() -// bB := seqB.Sequence() - -// // log.Println("N = ", N) - -// for y := 1; y <= N; y++ { -// // in_matrix := false -// x1 := y - lB + extra -// x2 := extra - y -// xs := obiutils.MaxInt(obiutils.MaxInt(x1, x2), 0) - -// x1 = y + extra -// x2 = lA + extra - y -// xf := obiutils.MinInt(obiutils.MinInt(x1, x2), even-1) + 1 - -// for x := xs; x < xf; x++ { - -// i := y - x + extra -// j := y + x - extra - -// var Sdiag, Sleft, Sup uint64 - -// switch { - -// case i == 0: -// Sup = _notavail -// Sdiag = _notavail -// Sleft = encodeValues(0, j-1, false) -// case j == 0: -// Sup = encodeValues(0, i-1, false) -// Sdiag = _notavail -// Sleft = _notavail -// default: -// Sdiag = previous[x] - -// if bA[j-1] == bB[i-1] { -// Sdiag = _incscore(Sdiag) -// } - -// if x < (even - 1) { -// Sup = previous[x+even] -// } else { -// Sup = _out -// } -// if x > 0 { -// Sleft = previous[x+even-1] -// } else { -// Sleft = _out -// } -// } - -// var score uint64 -// switch { -// case Sdiag >= Sup && Sdiag >= Sleft: -// score = Sdiag -// case Sup >= Sleft: -// score = Sup -// default: -// score = Sleft -// } - -// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { -// score = _setout(score) -// } - -// current[x] = _incpath(score) -// } -// // . 
9 10 + 2 - 1 -// x1 = y - lB + extra + even -// x2 = extra - y + even - 1 -// xs = obiutils.MaxInt(obiutils.MaxInt(x1, x2), even) - -// x1 = y + extra + even -// x2 = lA + extra - y + even - 1 -// xf = obiutils.MinInt(obiutils.MinInt(x1, x2), width-1) + 1 - -// for x := xs; x < xf; x++ { - -// i := y - x + extra + even -// j := y + x - extra - even + 1 - -// var Sdiag, Sleft, Sup uint64 - -// switch { - -// case i == 0: -// Sup = _notavail -// Sdiag = _notavail -// Sleft = encodeValues(0, j-1, false) -// case j == 0: -// Sup = encodeValues(0, i-1, false) -// Sdiag = _notavail -// Sleft = _notavail -// default: -// Sdiag = previous[x] -// if bA[j-1] == bB[i-1] { -// Sdiag = _incscore(Sdiag) -// } - -// Sleft = current[x-even] -// Sup = current[x-even+1] - -// } - -// var score uint64 -// switch { -// case Sdiag >= Sup && Sdiag >= Sleft: -// score = Sdiag -// case Sup >= Sleft: -// score = Sup -// default: -// score = Sleft -// } - -// if _isout(Sdiag) || _isout(Sup) || _isout(Sleft) { -// score = _setout(score) -// } - -// current[x] = _incpath(score) -// } - -// previous, current = current, previous - -// } - -// s, l, o := decodeValues(previous[(delta%2)*even+extra+(delta>>1)]) - -// if o { -// return -1, -1 -// } - -// return s, l -// } diff --git a/pkg/obialign/fastlcsegf.go b/pkg/obialign/fastlcsegf.go index e339229..152238b 100644 --- a/pkg/obialign/fastlcsegf.go +++ b/pkg/obialign/fastlcsegf.go @@ -130,11 +130,11 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ // in_matrix := false x1 := y - lB + extra x2 := extra - y - xs := obiutils.MaxInt(obiutils.MaxInt(x1, x2), 0) + xs := obiutils.Max(obiutils.Max(x1, x2), 0) x1 = y + extra x2 = lA + extra - y - xf := obiutils.MinInt(obiutils.MinInt(x1, x2), even-1) + 1 + xf := obiutils.Min(obiutils.Min(x1, x2), even-1) + 1 for x := xs; x < xf; x++ { @@ -222,11 +222,11 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ // . 
9 10 + 2 - 1 x1 = y - lB + extra + even x2 = extra - y + even - 1 - xs = obiutils.MaxInt(obiutils.MaxInt(x1, x2), even) + xs = obiutils.Max(obiutils.Max(x1, x2), even) x1 = y + extra + even x2 = lA + extra - y + even - 1 - xf = obiutils.MinInt(obiutils.MinInt(x1, x2), width-1) + 1 + xf = obiutils.Min(obiutils.Min(x1, x2), width-1) + 1 for x := xs; x < xf; x++ { @@ -348,16 +348,15 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ // - Matching: 1 // - Mismatch or gap: 0 // -// Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps. +// Parameters: +// - seqA: The first bio sequence. +// - seqB: The second bio sequence. +// - maxError: The maximum allowed error between the sequences. If set to -1, no limit is applied. +// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created. // -// if buffer != nil, the buffer is used to store intermediate results. -// Otherwise, a new buffer is allocated. -// -// seqA: The first bio sequence. -// seqB: The second bio sequence. -// maxError: The maximum allowed error between the sequences. -// buffer: A buffer to store intermediate results. -// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding. +// Returns: +// - The score of the longest common subsequence. +// - The length of the shortest alignment corresponding to the LCS. func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer) } @@ -372,14 +371,16 @@ func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uin // - Matching: 1 // - Mismatch or gap: 0 // -// if buffer != nil, the buffer is used to store intermediate results. -// Otherwise, a new buffer is allocated. +// Parameters: +// - seqA: The first bio sequence. +// - seqB: The second bio sequence. 
+// - maxError: The maximum allowed error between the sequences. If set to -1, no limit is applied. +// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created. // -// seqA: The first bio sequence. -// seqB: The second bio sequence. -// maxError: The maximum allowed error between the sequences. -// buffer: A buffer to store intermediate results. -// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding. +// Returns: +// - The score of the longest common subsequence. +// - The length of the shortest alignment corresponding to the LCS. func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer) } + diff --git a/pkg/obiapat/pattern.go b/pkg/obiapat/pattern.go index 6b6526b..7ee9555 100644 --- a/pkg/obiapat/pattern.go +++ b/pkg/obiapat/pattern.go @@ -348,8 +348,8 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) ( start = best[0] - nerr end = best[0] + int(pattern.pointer.pointer.patlen) + nerr - start = obiutils.MaxInt(start, 0) - end = obiutils.MinInt(end, sequence.Len()) + start = obiutils.Max(start, 0) + end = obiutils.Min(end, sequence.Len()) cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) frg := sequence.pointer.reference.Sequence()[start:end] @@ -387,8 +387,8 @@ func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int) if m[2] > 0 && pattern.pointer.pointer.hasIndel { start := m[0] - m[2] end := m[0] + int(pattern.pointer.pointer.patlen) + m[2] - start = obiutils.MaxInt(start, 0) - end = obiutils.MinInt(end, sequence.Len()) + start = obiutils.Max(start, 0) + end = obiutils.Min(end, sequence.Len()) cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) frg := sequence.pointer.reference.Sequence()[start:end] diff --git a/pkg/obigraph/graph.go 
b/pkg/obigraph/graph.go new file mode 100644 index 0000000..2ac5ef9 --- /dev/null +++ b/pkg/obigraph/graph.go @@ -0,0 +1,327 @@ +package obigraph + +import ( + "bytes" + "fmt" + "io" + "math" + "os" + "text/template" + + log "github.com/sirupsen/logrus" +) + +type Edge[T any] struct { + From int + To int + Data *T +} + +type Edges[T any] map[int]map[int]*T +type Graph[V any, T any] struct { + Name string + Vertices *[]V + Edges *Edges[T] + ReverseEdges *Edges[T] + VertexWeight func(int) float64 + VertexId func(int) string + EdgeWeight func(int, int) float64 +} + +func NewEdges[T any]() *Edges[T] { + e := make(map[int]map[int]*T) + + return (*Edges[T])(&e) +} + +// AddEdge adds an edge to the graph between two vertices. +// +// Parameters: +// - from: the index of the starting vertex. +// - to: the index of the ending vertex. +// - data: a pointer to the data associated with the edge. +func (e *Edges[T]) AddEdge(from, to int, data *T) { + + fnode, ok := (*e)[from] + if !ok { + fnode = make(map[int]*T) + (*e)[from] = fnode + } + fnode[to] = data +} + +// NewGraph creates a new graph with the specified name and vertices. +// +// Parameters: +// - name: a string representing the name of the graph. +// - vertices: a slice of vertices of type V. +// +// Returns: +// - Graph[V, T]: the newly created graph. +func NewGraph[V, T any](name string, vertices *[]V) *Graph[V, T] { + return &Graph[V, T]{ + Name: name, + Vertices: vertices, + Edges: NewEdges[T](), + ReverseEdges: NewEdges[T](), + VertexWeight: func(i int) float64 { + return 1.0 + }, + EdgeWeight: func(i, j int) float64 { + return 1.0 + }, + VertexId: func(i int) string { + return fmt.Sprintf("V%d", i) + }, + } +} + +// AddEdge adds an edge between two vertices in the graph. +// +// Parameters: +// - from: the index of the starting vertex. +// - to: the index of the ending vertex. +// - data: a pointer to the data associated with the edge. 
+func (g *Graph[V, T]) AddEdge(from, to int, data *T) { + lv := len(*g.Vertices) + if from >= lv || to >= lv { + log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1) + } + + g.Edges.AddEdge(from, to, data) + g.Edges.AddEdge(to, from, data) + g.ReverseEdges.AddEdge(to, from, data) + g.ReverseEdges.AddEdge(from, to, data) +} + +// AddDirectedEdge adds a directed edge from one vertex to another in the graph. +// +// Parameters: +// - from: an integer representing the index of the starting vertex. +// - to: an integer representing the index of the ending vertex. +// - data: a pointer to the data associated with the edge. +func (g *Graph[V, T]) AddDirectedEdge(from, to int, data *T) { + lv := len(*g.Vertices) + + if from >= lv || to >= lv { + log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1) + } + + g.Edges.AddEdge(from, to, data) + g.ReverseEdges.AddEdge(to, from, data) +} + +// SetAsDirectedEdge sets the edge from one vertex to another as directed in the graph. +// +// Parameters: +// - from: an integer representing the index of the starting vertex. +// - to: an integer representing the index of the ending vertex. +func (g *Graph[V, T]) SetAsDirectedEdge(from, to int) { + lv := len(*g.Vertices) + + if from >= lv || to >= lv { + log.Errorf("out of bounds vertex index: %d or %d (max: %d)", from, to, lv-1) + } + + if _, ok := (*g.Edges)[from][to]; ok { + if _, ok := (*g.Edges)[to][from]; ok { + delete((*g.Edges)[to], from) + delete((*g.Edges)[from], to) + } + + return + } + + log.Error("no edge from ", from, " to ", to) + +} + +// Neighbors generates a list of neighbor vertices for a given vertex index in the graph. +// +// Parameters: +// - v: an integer representing the index of the vertex. +// Returns: +// - []int: a list of neighbor vertices. 
+func (g *Graph[V, T]) Neighbors(v int) []int { + if neighbors, ok := (*g.Edges)[v]; ok { + rep := make([]int, 0, len(neighbors)) + + for k := range neighbors { + rep = append(rep, k) + } + + return rep + } + + return nil + +} + +// Degree calculates the degree of a vertex in a graph. +// +// Parameters: +// - v: an integer representing the index of the vertex. +// +// Returns: +// - an integer representing the degree of the vertex. +func (g *Graph[V, T]) Degree(v int) int { + if neighbors, ok := (*g.Edges)[v]; ok { + return len(neighbors) + } + return 0 +} + +// Parents returns a list of parent vertices for a given vertex index in the graph. +// +// Parameters: +// - v: an integer representing the index of the vertex. +// +// Returns: +// - []int: a list of parent vertices. +func (g *Graph[V, T]) Parents(v int) []int { + if parents, ok := (*g.ReverseEdges)[v]; ok { + rep := make([]int, 0, len(parents)) + + for k := range parents { + rep = append(rep, k) + } + + return rep + } + + return nil +} + +// ParentDegree calculates the degree of a vertex in a graph by counting the number of its parent vertices. +// +// Parameters: +// - v: an integer representing the index of the vertex. +// +// Returns: +// - an integer representing the degree of the vertex. +func (g *Graph[V, T]) ParentDegree(v int) int { + if parents, ok := (*g.ReverseEdges)[v]; ok { + return len(parents) + } + return 0 + +} + +type gml_graph[V any, T any] struct { + Graph *Graph[V, T] + As_directed bool + Min_degree int + Threshold float64 + Scale int +} + +// Gml generates a GML representation of the graph. +// +// as_directed: whether the graph should be treated as directed or undirected. +// threshold: the threshold value. +// scale: the scaling factor. +// string: the GML representation of the graph. 
+func (g *Graph[V, T]) Gml(as_directed bool, min_degree int, threshold float64, scale int) string { + // (*seqs)[1].Count + var gml bytes.Buffer + + data := gml_graph[V, T]{ + Graph: g, + As_directed: as_directed, + Min_degree: min_degree, + Threshold: threshold, + Scale: scale, + } + + digraphTpl := template.New("gml_digraph") + + digraph := ` {{$context := .}} + graph [ + comment "{{ if $context.As_directed }}Directed graph{{ else }}Undirected graph{{ end }} {{ Name }}" + directed {{ if $context.As_directed }}1{{ else }}0{{ end }} + + {{range $index, $data:= $context.Graph.Vertices}} + {{ if (ge (Degree $index) $context.Min_degree)}} + node [ + id {{$index}} + graphics [ + type "{{ Shape $index }}" + h {{ Sqrt (VertexWeight $index) }} + w {{ Sqrt (VertexWeight $index) }} + ] + ] + {{ end }} + {{ end }} + + {{range $source, $data:= $context.Graph.Edges}} + {{range $target, $edge:= $data}} + {{ if and (ge $source $context.Min_degree) (ge $target $context.Min_degree) (or $context.As_directed (lt $source $target))}} + edge [ source {{$source}} + target {{$target}} + color "#00FF00" + ] + {{ end }} + {{ end }} + {{ end }} + + ] + ` + + tmpl, err := digraphTpl.Funcs(template.FuncMap{ + "Sqrt": func(i float64) int { return scale * int(math.Floor(math.Sqrt(i))) }, + "Name": func() string { return g.Name }, + "VertexId": func(i int) string { return g.VertexId(i) }, + "Degree": func(i int) int { return g.Degree(i) }, + "VertexWeight": func(i int) float64 { return g.VertexWeight(i) }, + "Shape": func(i int) string { + if g.VertexWeight(i) >= threshold { + return "circle" + } else { + return "rectangle" + } + }, + }).Parse(digraph) + + if err != nil { + panic(err) + } + + err = tmpl.Execute(&gml, data) + + if err != nil { + panic(err) + } + + return gml.String() + +} + +// WriteGml writes the GML representation of the graph to an io.Writer. +// +// w: the io.Writer to write the GML representation to. 
+// as_directed: whether the graph should be treated as directed or undirected. +// threshold: the threshold value. +// scale: the scaling factor. +func (g *Graph[V, T]) WriteGml(w io.Writer, as_directed bool, min_degree int, threshold float64, scale int) { + + _, err := w.Write([]byte(g.Gml(as_directed, min_degree, threshold, scale))) + if err != nil { + panic(err) + } +} + +// WriteGmlFile writes the graph in GML format to the specified file. +// +// filename: the name of the file to write the GML representation to. +// as_directed: whether the graph should be treated as directed or undirected. +// threshold: the threshold value. +// scale: the scaling factor. +func (g *Graph[V, T]) WriteGmlFile(filename string, as_directed bool, min_degree int, threshold float64, scale int) { + + f, err := os.Create(filename) + if err != nil { + panic(err) + } + defer f.Close() + g.WriteGml(f, as_directed, min_degree, threshold, scale) +} diff --git a/pkg/obigraph/graphbuffer.go b/pkg/obigraph/graphbuffer.go new file mode 100644 index 0000000..3368cb3 --- /dev/null +++ b/pkg/obigraph/graphbuffer.go @@ -0,0 +1,104 @@ +package obigraph + +import ( + "io" + "os" +) + +type GraphBuffer[V, T any] struct { + Graph *Graph[V, T] + Channel chan Edge[T] +} + +// NewGraphBuffer creates a new GraphBuffer with the given name and vertices. +// +// Parameters: +// - name: the name of the GraphBuffer. +// - vertices: a slice of vertices to initialize the GraphBuffer. +// +// Returns: +// - GraphBuffer[V, T]: the newly created GraphBuffer. +func NewGraphBuffer[V, T any](name string, vertices *[]V) *GraphBuffer[V, T] { + buffer := GraphBuffer[V, T]{ + Graph: NewGraph[V, T](name, vertices), + Channel: make(chan Edge[T]), + } + + go func() { + for edge := range buffer.Channel { + buffer.Graph.AddEdge(edge.From, edge.To, edge.Data) + } + }() + + return &buffer +} + +// AddEdge adds an edge to the GraphBuffer. +// +// Parameters: +// - from: the index of the starting vertex. 
+// - to: the index of the ending vertex. +// - data: a pointer to the data associated with the edge. +func (g *GraphBuffer[V, T]) AddEdge(from, to int, data *T) { + g.Channel <- Edge[T]{ + From: from, + To: to, + Data: data, + } +} + +// AddDirectedEdge adds a directed edge from one vertex to another in the GraphBuffer. +// +// Parameters: +// - from: the index of the starting vertex. +// - to: the index of the ending vertex. +// - data: a pointer to the data associated with the edge. +func (g *GraphBuffer[V, T]) AddDirectedEdge(from, to int, data *T) { + g.Channel <- Edge[T]{ + From: from, + To: to, + Data: data, + } +} + +// Gml generates a GML representation of the graph. +// +// as_directed: whether the graph should be treated as directed or undirected. +// min_degree: the minimum degree of vertices to include in the GML representation. +// threshold: the threshold value. +// scale: the scaling factor. +// string: the GML representation of the graph. +func (g *GraphBuffer[V, T]) Gml(as_directed bool, min_degree int, threshold float64, scale int) string { + return g.Graph.Gml(as_directed, min_degree, threshold, scale) +} + +func (g *GraphBuffer[V, T]) WriteGmlFile(filename string, as_directed bool, min_degree int, threshold float64, scale int) { + + f, err := os.Create(filename) + if err != nil { + panic(err) + } + defer f.Close() + g.WriteGml(f, as_directed, min_degree, threshold, scale) +} + +// WriteGml writes the GML representation of the graph to an io.Writer. +// +// w: the io.Writer to write the GML representation to. +// as_directed: whether the graph should be treated as directed or undirected. +// min_degree: the minimum degree of vertices to include in the GML representation. +// threshold: the threshold value. +// scale: the scaling factor. 
+func (g *GraphBuffer[V, T]) WriteGml(w io.Writer, as_directed bool, min_degree int, threshold float64, scale int) { + _, err := w.Write([]byte(g.Gml(as_directed, min_degree, threshold, scale))) + if err != nil { + panic(err) + } +} + +// Close closes the GraphBuffer by closing its channel. +// +// No parameters. +func (g *GraphBuffer[V, T]) Close() { + close(g.Channel) +} diff --git a/pkg/obiiter/fragment.go b/pkg/obiiter/fragment.go index a673ce7..7abdd6b 100644 --- a/pkg/obiiter/fragment.go +++ b/pkg/obiiter/fragment.go @@ -30,7 +30,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable { news = append(news, s) } else { for i := 0; i < s.Len(); i += step { - end := obiutils.MinInt(i+length, s.Len()) + end := obiutils.Min(i+length, s.Len()) fusion := false if (s.Len() - end) < step { end = s.Len() diff --git a/pkg/obikmer/counting.go b/pkg/obikmer/counting.go index bb5f22a..5ce8d31 100644 --- a/pkg/obikmer/counting.go +++ b/pkg/obikmer/counting.go @@ -9,9 +9,10 @@ import ( type Table4mer [256]uint16 -func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer { - iternal_buffer := Encode4mer(seq, buffer) + +func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer { + iternal_buffer := Encode4mer(seq, buffer) // The slice of 4-mer codes if counts == nil { var w Table4mer @@ -19,7 +20,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl } // Every cells of the counter is set to zero - for i := 0; i < 256; i++ { + for i := 0; i < 256; i++ { // 256 is the number of possible 4-mer codes (*counts)[i] = 0 } @@ -32,7 +33,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl func Common4Mer(count1, count2 *Table4mer) int { sum := 0 for i := 0; i < 256; i++ { - sum += int(obiutils.MinUInt16((*count1)[i], (*count2)[i])) + sum += int(obiutils.Min((*count1)[i], (*count2)[i])) } return sum } @@ -48,7 +49,7 @@ func Sum4Mer(count 
*Table4mer) int { func LCS4MerBounds(count1, count2 *Table4mer) (int, int) { s1 := Sum4Mer(count1) s2 := Sum4Mer(count2) - smin := obiutils.MinInt(s1, s2) + smin := obiutils.Min(s1, s2) cw := Common4Mer(count1, count2) @@ -65,7 +66,7 @@ func LCS4MerBounds(count1, count2 *Table4mer) (int, int) { func Error4MerBounds(count1, count2 *Table4mer) (int, int) { s1 := Sum4Mer(count1) s2 := Sum4Mer(count2) - smax := obiutils.MaxInt(s1, s2) + smax := obiutils.Max(s1, s2) cw := Common4Mer(count1, count2) diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index 1436d34..1df3b6a 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -2,13 +2,16 @@ package obikmer import ( "bytes" + "container/heap" "fmt" "math" "math/bits" + "slices" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "github.com/daichi-m/go18ds/sets/linkedhashset" "github.com/daichi-m/go18ds/stacks/arraystack" + log "github.com/sirupsen/logrus" ) type KmerIdx32 uint32 @@ -49,36 +52,60 @@ type KmerIdx_t interface { } type DeBruijnGraph struct { - kmersize int - kmermask uint64 - prevc uint64 + kmersize int // k-mer size + kmermask uint64 // mask used to set to 0 the bits that are not in the k-mer + prevc uint64 // prevg uint64 prevt uint64 - graph map[uint64]uint + graph map[uint64]uint // Kmer are encoded as uint64 with 2 bits per character } +// MakeDeBruijnGraph creates a De Bruijn Graph with the specified k-mer size. 
+// +// Parameters: +// +// kmersize int - the size of the k-mers +// +// Returns: +// +// *DeBruijnGraph - a pointer to the created De Bruijn's Graph func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph { g := DeBruijnGraph{ kmersize: kmersize, - kmermask: ^(^uint64(0) << (uint64(kmersize+1) * 2)), - prevc: uint64(1) << (uint64(kmersize) * 2), - prevg: uint64(2) << (uint64(kmersize) * 2), - prevt: uint64(3) << (uint64(kmersize) * 2), + kmermask: ^(^uint64(0) << (uint64(kmersize) * 2)), // k-mer mask used to set to 0 the bits that are not in the k-mer + prevc: uint64(1) << (uint64(kmersize-1) * 2), + prevg: uint64(2) << (uint64(kmersize-1) * 2), + prevt: uint64(3) << (uint64(kmersize-1) * 2), graph: make(map[uint64]uint), } return &g } +// KmerSize returns the size of the k-mers in the DeBruijn graph. +// +// This function takes no parameters. +// It returns an integer representing the size of the k-mers. func (g *DeBruijnGraph) KmerSize() int { return g.kmersize } +// Len returns the length of the graph. +// +// This function takes no parameters. +// It returns an integer representing the number of nodes in the graph. func (g *DeBruijnGraph) Len() int { return len(g.graph) } -func (g *DeBruijnGraph) MaxLink() int { +// MaxWeight returns the maximum weight of a node from the DeBruijn's Graph. +// +// It iterates over each count in the graph map and updates the max value if the current count is greater. +// Finally, it returns the maximum weight as an integer. +// +// Returns: +// - int: the maximum weight value. +func (g *DeBruijnGraph) MaxWeight() int { max := uint(0) for _, count := range g.graph { if count > max { @@ -89,8 +116,12 @@ func (g *DeBruijnGraph) MaxLink() int { return int(max) } -func (g *DeBruijnGraph) LinkSpectrum() []int { - max := g.MaxLink() +// WeightSpectrum calculates the weight spectrum of nodes in the DeBruijn's graph. +// +// No parameters. +// Returns an array of integers representing the weight spectrum. 
+func (g *DeBruijnGraph) WeightSpectrum() []int { + max := g.MaxWeight() spectrum := make([]int, max+1) for _, count := range g.graph { spectrum[int(count)]++ @@ -99,7 +130,10 @@ func (g *DeBruijnGraph) LinkSpectrum() []int { return spectrum } -func (g *DeBruijnGraph) FilterMin(min int) { +// FilterMinWeight filters the DeBruijnGraph by removing nodes with weight less than the specified minimum. +// +// min: an integer representing the minimum count threshold. +func (g *DeBruijnGraph) FilterMinWeight(min int) { umin := uint(min) for idx, count := range g.graph { if count < umin { @@ -109,8 +143,12 @@ func (g *DeBruijnGraph) FilterMin(min int) { } func (g *DeBruijnGraph) Previouses(index uint64) []uint64 { + if _, ok := g.graph[index]; !ok { + log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index) + } + rep := make([]uint64, 0, 4) - index = index >> 2 + index >>= 2 if _, ok := g.graph[index]; ok { rep = append(rep, index) @@ -135,6 +173,10 @@ func (g *DeBruijnGraph) Previouses(index uint64) []uint64 { } func (g *DeBruijnGraph) Nexts(index uint64) []uint64 { + if _, ok := g.graph[index]; !ok { + log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index) + } + rep := make([]uint64, 0, 4) index = (index << 2) & g.kmermask @@ -160,11 +202,11 @@ func (g *DeBruijnGraph) Nexts(index uint64) []uint64 { return rep } -func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) { +func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, int, bool) { ns := g.Nexts(index) if len(ns) == 0 { - return uint64(0), false + return uint64(0), 0, false } max := uint(0) @@ -177,7 +219,34 @@ func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) { } } - return rep, true + return rep, int(max), true +} + +func (g *DeBruijnGraph) Heads() []uint64 { + rep := make([]uint64, 0, 10) + + for k := range g.graph { + if len(g.Previouses(k)) == 0 { + rep = append(rep, k) + } + } + + return rep +} + +func (g *DeBruijnGraph) MaxHead() (uint64, int, bool) { 
+ rep := uint64(0) + max := uint(0) + found := false + for k, w := range g.graph { + if len(g.Previouses(k)) == 0 && w > max { + rep = k + max = w + found = true + } + } + + return rep, int(max), found } func (g *DeBruijnGraph) MaxPath() []uint64 { @@ -185,17 +254,17 @@ func (g *DeBruijnGraph) MaxPath() []uint64 { ok := false idx := uint64(0) - idx, ok = g.MaxHead() + idx, _, ok = g.MaxHead() for ok { path = append(path, idx) - idx, ok = g.MaxNext(idx) + idx, _, ok = g.MaxNext(idx) } return path } -func (g *DeBruijnGraph) LongestPath() []uint64 { +func (g *DeBruijnGraph) LongestPath(max_length int) []uint64 { var path []uint64 wmax := uint(0) ok := true @@ -209,7 +278,11 @@ func (g *DeBruijnGraph) LongestPath() []uint64 { nw := g.graph[idx] w += nw lp = append(lp, idx) - idx, ok = g.MaxNext(idx) + idx, _, ok = g.MaxNext(idx) + if max_length > 0 && len(lp) > max_length { + ok = false + w = 0 + } } if w > wmax { @@ -221,8 +294,9 @@ func (g *DeBruijnGraph) LongestPath() []uint64 { return path } -func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) { - path := g.LongestPath() +func (g *DeBruijnGraph) LongestConsensus(id string, max_length int) (*obiseq.BioSequence, error) { + //path := g.LongestPath(max_length) + path := g.HaviestPath() s := g.DecodePath(path) if len(s) > 0 { @@ -238,37 +312,10 @@ func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) return nil, fmt.Errorf("cannot identify optimum path") } -func (g *DeBruijnGraph) Heads() []uint64 { - rep := make([]uint64, 0, 10) - - for k := range g.graph { - if len(g.Previouses(k)) == 0 { - rep = append(rep, k) - } - } - - return rep -} - -func (g *DeBruijnGraph) MaxHead() (uint64, bool) { - rep := uint64(0) - max := uint(0) - found := false - for k, w := range g.graph { - if len(g.Previouses(k)) == 0 && w > max { - rep = k - found = true - } - } - - return rep, found -} - func (g *DeBruijnGraph) DecodeNode(index uint64) string { rep := make([]byte, g.kmersize) - 
index >>= 2 for i := g.kmersize - 1; i >= 0; i-- { - rep[i], _ = decode[index&3] + rep[i] = decode[index&3] index >>= 2 } @@ -282,7 +329,7 @@ func (g *DeBruijnGraph) DecodePath(path []uint64) string { if len(path) > 0 { buf.WriteString(g.DecodeNode(path[0])) - for _, idx := range path { + for _, idx := range path[1:] { buf.WriteByte(decode[idx&3]) } } @@ -307,6 +354,13 @@ func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence, error) { return nil, fmt.Errorf("cannot identify optimum path") } +// Weight returns the weight of the node at the given index in the DeBruijnGraph. +// +// Parameters: +// - index: the index of the node in the graph. +// +// Returns: +// - int: the weight of the node. func (g *DeBruijnGraph) Weight(index uint64) int { val, ok := g.graph[index] if !ok { @@ -315,59 +369,64 @@ func (g *DeBruijnGraph) Weight(index uint64) int { return int(val) } +// append appends a sequence of nucleotides to the DeBruijnGraph. +// +// Parameters: +// - sequence: a byte slice representing the sequence of nucleotides to append. +// - current: the current node in the graph to which the sequence will be appended. +// - weight: the weight of the added nodes. 
func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int) { - for i := 0; i < len(sequence); i++ { - current <<= 2 - current &= graph.kmermask - b := iupac[sequence[i]] - if len(b) == 1 { - current |= b[0] - graph.graph[current] = uint(graph.Weight(current) + weight) - } else { - for j := 0; j < len(b); j++ { - current &= ^uint64(3) - current |= b[j] + if len(sequence) == 0 { + return + } - graph.graph[current] = uint(graph.Weight(current) + weight) - graph.append(sequence[(i+1):], current, weight) - } - return - } + current <<= 2 + current &= graph.kmermask + b := iupac[sequence[0]] + current |= b[0] + graph.graph[current] = uint(graph.Weight(current) + weight) + graph.append(sequence[1:], current, weight) + for j := 1; j < len(b); j++ { + current &= ^uint64(3) + current |= b[j] + graph.graph[current] = uint(graph.Weight(current) + weight) + graph.append(sequence[1:], current, weight) } } func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) { - key := uint64(0) - s := sequence.Sequence() - w := sequence.Count() - init := make([]uint64, 0, 16) - var f func(start int, key uint64) - f = func(start int, key uint64) { - for i := start; i < graph.kmersize; i++ { - key <<= 2 - b := iupac[s[i]] - if len(b) == 1 { - key |= b[0] - } else { - for j := 0; j < len(b); j++ { - key &= ^uint64(3) - key |= b[j] - f(i+1, key) - } - return - } + s := sequence.Sequence() // Get the sequence as a byte slice + w := sequence.Count() // Get the weight of the sequence + + var initFirstKmer func(start int, key uint64) + + // Initialize the first k-mer + // start is the index of the nucleotide in the k-mer to add + // key is the value of the k-mer index before adding the start nucleotide + initFirstKmer = func(start int, key uint64) { + if start == 0 { + key = 0 + } + + if start < graph.kmersize { + key <<= 2 + b := iupac[s[start]] + + for _, code := range b { + key &= ^uint64(3) + key |= code + initFirstKmer(start+1, key) + } + } else { + graph.graph[key] = 
uint(graph.Weight(key) + w) + graph.append(s[graph.kmersize:], key, w) } - init = append(init, key&graph.kmermask) } if sequence.Len() > graph.kmersize { - f(0, key) - - for _, idx := range init { - graph.append(s[graph.kmersize:], idx, w) - } + initFirstKmer(0, 0) } } @@ -381,43 +440,51 @@ func (graph *DeBruijnGraph) Gml() string { `) + nodeidx := make(map[uint64]int) + nodeid := 0 + for idx := range graph.graph { - node := graph.DecodeNode(idx) - buffer.WriteString( - fmt.Sprintf("node [ id \"%s\" ]\n", node), - ) - n := graph.Nexts(uint64(idx)) - if len(n) == 0 { - idx <<= 2 - idx &= graph.kmermask + nodeid++ + nodeidx[idx] = nodeid + n := graph.Nexts(idx) + p := graph.Previouses(idx) + + if len(n) == 0 || len(p) == 0 { + node := graph.DecodeNode(idx) buffer.WriteString( - fmt.Sprintf("node [ id \"%s\" \n label \"%s\" ]\n", node, node), + fmt.Sprintf("node [ id \"%d\" \n label \"%s\" ]\n", nodeid, node), + ) + } else { + buffer.WriteString( + fmt.Sprintf("node [ id \"%d\" ]\n", nodeid), ) } } - for idx, weight := range graph.graph { - src := graph.DecodeNode(idx) - label := decode[idx&3] - idx <<= 2 - idx &= graph.kmermask - dst := graph.DecodeNode(idx) - - buffer.WriteString( - fmt.Sprintf(`edge [ source "%s" - target "%s" - color "#00FF00" - label "%c[%d]" - graphics [ - width %f - arrow "last" - fill "#007F80" + for idx := range graph.graph { + srcid := nodeidx[idx] + n := graph.Nexts(idx) + for _, dst := range n { + dstid := nodeidx[dst] + label := decode[dst&3] + weight := graph.Weight(dst) + buffer.WriteString( + fmt.Sprintf(`edge [ source "%d" + target "%d" + color "#00FF00" + label "%c[%d]" + graphics [ + width %f + arrow "last" + fill "#007F80" + ] ] - ] - - `, src, dst, label, weight, math.Log(float64(weight))), - ) + + `, srcid, dstid, label, weight, math.Sqrt(float64(weight))), + ) + } + } buffer.WriteString("]\n") return buffer.String() @@ -472,3 +539,151 @@ func (g *DeBruijnGraph) HammingDistance(kmer1, kmer2 uint64) int { ident &= 
0x5555555555555555 & g.kmermask return bits.OnesCount64(ident) } + +type UInt64Heap []uint64 + +func (h UInt64Heap) Len() int { return len(h) } +func (h UInt64Heap) Less(i, j int) bool { return h[i] < h[j] } +func (h UInt64Heap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } + +func (h *UInt64Heap) Push(x any) { + // Push and Pop use pointer receivers because they modify the slice's length, + // not just its contents. + *h = append(*h, x.(uint64)) +} + +func (h *UInt64Heap) Pop() any { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +func (g *DeBruijnGraph) HaviestPath() []uint64 { + + if g.HasCycle() { + return nil + } + // Initialize the distance array and visited set + distances := make(map[uint64]int) + visited := make(map[uint64]bool) + prevNodes := make(map[uint64]uint64) + heaviestNode := uint64(0) + heaviestWeight := 0 + + queue := &UInt64Heap{} + heap.Init(queue) + + startNodes := make(map[uint64]struct{}) + for _, n := range g.Heads() { + startNodes[n] = struct{}{} + heap.Push(queue, n) + distances[n] = g.Weight(n) + prevNodes[n] = 0 + visited[n] = false + } + + // Priority queue to keep track of nodes to visit + for len(*queue) > 0 { + // Get the node with the smallest distance + currentNode := heap.Pop(queue).(uint64) + + // If the current node has already been visited, skip it + if visited[currentNode] { + continue + } + + // Mark the node as visited + visited[currentNode] = true + weight := distances[currentNode] + + // Update the heaviest node + if weight > heaviestWeight { + heaviestWeight = weight + heaviestNode = currentNode + } + + if currentNode == 0 { + log.Warn("current node is 0") + } + // Update the distance of the neighbors + nextNodes := g.Nexts(currentNode) + for _, nextNode := range nextNodes { + if nextNode == 0 { + log.Warn("next node is 0") + } + weight := g.Weight(nextNode) + distances[currentNode] + if distances[nextNode] < weight { + distances[nextNode] = weight + prevNodes[nextNode] = currentNode + 
visited[nextNode] = false + heap.Push(queue, nextNode) + + // Keep track of the node with the heaviest weight + if weight > heaviestWeight { + heaviestWeight = weight + heaviestNode = nextNode + } + } + } + } + + log.Infof("Heaviest node: %d [%v]", heaviestNode, heaviestWeight) + // Reconstruct the path from the start node to the heaviest node found + heaviestPath := make([]uint64, 0) + currentNode := heaviestNode + for _, ok := startNodes[currentNode]; !ok && !slices.Contains(heaviestPath, currentNode); _, ok = startNodes[currentNode] { + heaviestPath = append(heaviestPath, currentNode) + //log.Infof("Current node: %d <- %d", currentNode, prevNodes[currentNode]) + currentNode = prevNodes[currentNode] + } + + if slices.Contains(heaviestPath, currentNode) { + log.Fatalf("Cycle detected %v -> %v (%v) len(%v)", heaviestPath, currentNode, startNodes, len(heaviestPath)) + return nil + } + + heaviestPath = append(heaviestPath, currentNode) + + // Reverse the path + slices.Reverse(heaviestPath) + + return heaviestPath +} + +func (g *DeBruijnGraph) HasCycle() bool { + // Initialize the visited and stack arrays + visited := make(map[uint64]bool) + stack := make(map[uint64]bool) + + // Helper function to perform DFS + var dfs func(node uint64) bool + dfs = func(node uint64) bool { + visited[node] = true + stack[node] = true + + nextNodes := g.Nexts(node) + for _, nextNode := range nextNodes { + if !visited[nextNode] { + if dfs(nextNode) { + return true + } + } else if stack[nextNode] { + return true + } + } + stack[node] = false + return false + } + + // Perform DFS on each node to check for cycles + for node := range g.graph { + if !visited[node] { + if dfs(node) { + return true + } + } + } + return false +} diff --git a/pkg/obingslibrary/match.go b/pkg/obingslibrary/match.go index 898bf13..5621aed 100644 --- a/pkg/obingslibrary/match.go +++ b/pkg/obingslibrary/match.go @@ -130,7 +130,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { sseq := 
sequence.String() direct := sseq[start:end] - tagstart := obiutils.MaxInt(start-marker.taglength, 0) + tagstart := obiutils.Max(start-marker.taglength, 0) ftag := strings.ToLower(sseq[tagstart:start]) m := DemultiplexMatch{ @@ -150,7 +150,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { reverse, _ := sequence.Subsequence(start, end, false) defer reverse.Recycle() reverse = reverse.ReverseComplement(true) - endtag := obiutils.MinInt(end+marker.taglength, sequence.Len()) + endtag := obiutils.Min(end+marker.taglength, sequence.Len()) rtag, err := sequence.Subsequence(end, endtag, false) defer rtag.Recycle() srtag := "" @@ -201,7 +201,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { sseq := sequence.String() reverse := strings.ToLower(sseq[start:end]) - tagstart := obiutils.MaxInt(start-marker.taglength, 0) + tagstart := obiutils.Max(start-marker.taglength, 0) rtag := strings.ToLower(sseq[tagstart:start]) m := DemultiplexMatch{ @@ -221,7 +221,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { defer direct.Recycle() direct = direct.ReverseComplement(true) - endtag := obiutils.MinInt(end+marker.taglength, sequence.Len()) + endtag := obiutils.Min(end+marker.taglength, sequence.Len()) ftag, err := sequence.Subsequence(end, endtag, false) defer ftag.Recycle() sftag := "" diff --git a/pkg/obiseq/language.go b/pkg/obiseq/language.go index ed54e41..4eeaf2a 100644 --- a/pkg/obiseq/language.go +++ b/pkg/obiseq/language.go @@ -38,13 +38,7 @@ import ( // return float64(m) // } -// func minIntVector(values []int) float64 { -// m := values[0] -// for _, v := range values { -// if v < m { -// m = v -// } -// } + // return float64(m) // } diff --git a/pkg/obisuffix/suffix_array.go b/pkg/obisuffix/suffix_array.go index ae1f5f4..cb28931 100644 --- a/pkg/obisuffix/suffix_array.go +++ b/pkg/obisuffix/suffix_array.go @@ -27,7 +27,7 @@ func SuffixLess(suffixarray SuffixArray) func(i, j int) 
bool { sj := suffixarray.Suffixes[j] bj := (*suffixarray.Sequences)[int(sj.Idx)].Sequence()[sj.Pos:] - l := obiutils.MinInt(len(bi), len(bj)) + l := obiutils.Min(len(bi), len(bj)) p := 0 for p < l && bi[p] == bj[p] { p++ @@ -92,7 +92,7 @@ func (suffixarray *SuffixArray) CommonSuffix() []int { si := suffixarray.Suffixes[i] bi := (*suffixarray.Sequences)[int(si.Idx)].Sequence()[si.Pos:] - l := obiutils.MinInt(len(bi), len(bp)) + l := obiutils.Min(len(bi), len(bp)) p := 0 for p < l && bi[p] == bp[p] { p++ diff --git a/pkg/obitools/obiconsensus/obiconsensus.go b/pkg/obitools/obiconsensus/obiconsensus.go index d02ece8..3dcb628 100644 --- a/pkg/obitools/obiconsensus/obiconsensus.go +++ b/pkg/obitools/obiconsensus/obiconsensus.go @@ -4,92 +4,93 @@ import ( "fmt" "os" "path" - "sort" + "slices" log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) func BuildConsensus(seqs obiseq.BioSequenceSlice, + consensus_id string, kmer_size int, quorum float64, min_depth float64, + max_length int, save_graph bool, dirname string) (*obiseq.BioSequence, error) { + if save_graph { + if dirname == "" { + dirname = "." 
+ } + + if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() { + // path does not exist or is not directory + os.RemoveAll(dirname) + err := os.Mkdir(dirname, 0755) + + if err != nil { + log.Panicf("Cannot create directory %s for saving graphs", dirname) + } + } + + fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s.fasta", consensus_id))) + + if err == nil { + defer fasta.Close() + fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(0, seqs), obiformats.FormatFastSeqJsonHeader, false)) + fasta.Close() + } + + } + log.Printf("Number of reads : %d\n", len(seqs)) if kmer_size < 0 { longest := make([]int, len(seqs)) - for i := range seqs { - s := seqs[i : i+1] + for i, seq := range seqs { + s := obiseq.BioSequenceSlice{seq} sa := obisuffix.BuildSuffixArray(&s) - longest[i] = obiutils.MaxSlice(sa.CommonSuffix()) + longest[i] = slices.Max(sa.CommonSuffix()) } - o := obiutils.Order(sort.IntSlice(longest)) - i := int(float64(len(seqs)) * quorum) + // o := obiutils.Order(sort.IntSlice(longest)) + // i := int(float64(len(seqs)) * quorum) - kmer_size = longest[o[i]] + 1 + // if i >= len(o) { + // i = len(o) - 1 + // } + + kmer_size = slices.Max(longest) + 1 + + // kmer_size = longest[o[i]] + 1 log.Printf("estimated kmer size : %d", kmer_size) } - graph := obikmer.MakeDeBruijnGraph(kmer_size) + var graph *obikmer.DeBruijnGraph + for { + graph = obikmer.MakeDeBruijnGraph(kmer_size) - for _, s := range seqs { - graph.Push(s) + for _, s := range seqs { + graph.Push(s) + } + + if !graph.HasCycle() { + break + } + + kmer_size++ + log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size) } - log.Printf("Graph size : %d\n", graph.Len()) - total_kmer := graph.Len() - - threshold := 0 - - switch { - case min_depth < 0: - spectrum := graph.LinkSpectrum() - cum := make(map[int]int) - - spectrum[1] = 0 - for i := 2; i < len(spectrum); i++ { - spectrum[i] += spectrum[i-1] - cum[spectrum[i]]++ - } - - max := 0 - kmax := 0 - for k, obs := range 
cum { - if obs > max { - max = obs - kmax = k - } - } - - for i, total := range spectrum { - if total == kmax { - threshold = i - break - } - } - threshold /= 2 - case min_depth >= 1: - threshold = int(min_depth) - default: - threshold = int(float64(len(seqs)) * min_depth) - } - - graph.FilterMin(threshold) - - log.Printf("Graph size : %d\n", graph.Len()) - if save_graph { file, err := os.Create(path.Join(dirname, - fmt.Sprintf("%s.gml", seqs[0].Source()))) + fmt.Sprintf("%s_raw_consensus.gml", consensus_id))) if err != nil { fmt.Println(err) @@ -99,29 +100,133 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, } } - id := seqs[0].Source() - if id == "" { - id = seqs[0].Id() - } - seq, err := graph.LongestConsensus(id) + log.Printf("Graph size : %d\n", graph.Len()) + total_kmer := graph.Len() + + // threshold := 0 + + // switch { + // case min_depth < 0: + // spectrum := graph.WeightSpectrum() + // cum := make(map[int]int) + + // spectrum[1] = 0 + // for i := 2; i < len(spectrum); i++ { + // spectrum[i] += spectrum[i-1] + // cum[spectrum[i]]++ + // } + + // max := 0 + // kmax := 0 + // for k, obs := range cum { + // if obs > max { + // max = obs + // kmax = k + // } + // } + + // for i, total := range spectrum { + // if total == kmax { + // threshold = i + // break + // } + // } + // threshold /= 2 + + // if threshold < 1 { + // threshold = 1 + // } + + // log.Info("Estimated kmer_min_occur = ", threshold) + // case min_depth >= 1: + // threshold = int(min_depth) + // default: + // threshold = int(float64(len(seqs)) * min_depth) + // } + + // graph.FilterMinWeight(threshold) + + // log.Printf("Graph size : %d\n", graph.Len()) + + // if save_graph { + + // file, err := os.Create(path.Join(dirname, + // fmt.Sprintf("%s_consensus.gml", consensus_id))) + + // if err != nil { + // fmt.Println(err) + // } else { + // file.WriteString(graph.Gml()) + // file.Close() + // } + // } + + seq, err := graph.LongestConsensus(consensus_id, max_length) sumCount := 0 - for _, s := 
range seqs { - sumCount += s.Count() + if seq != nil { + for _, s := range seqs { + sumCount += s.Count() + } + + seq.SetCount(sumCount) + seq.SetAttribute("seq_length", seq.Len()) + seq.SetAttribute("kmer_size", kmer_size) + //seq.SetAttribute("kmer_min_occur", threshold) + seq.SetAttribute("kmer_max_occur", graph.MaxWeight()) + seq.SetAttribute("filtered_graph_size", graph.Len()) + seq.SetAttribute("full_graph_size", total_kmer) } - - seq.SetCount(sumCount) - seq.SetAttribute("seq_length", seq.Len()) - seq.SetAttribute("kmer_size", kmer_size) - seq.SetAttribute("kmer_min_occur", threshold) - seq.SetAttribute("kmer_max_occur", graph.MaxLink()) - seq.SetAttribute("filtered_graph_size", graph.Len()) - seq.SetAttribute("full_graph_size", total_kmer) - return seq, err } +// func BuildConsensusWithTimeout(seqs obiseq.BioSequenceSlice, +// kmer_size int, quorum float64, +// min_depth float64, +// save_graph bool, dirname string, timeout time.Duration) (*obiseq.BioSequence, error) { + +// ctx, cancel := context.WithTimeout(context.Background(), timeout) +// defer cancel() + +// consensus := func() *obiseq.BioSequence { +// cons, err := BuildConsensus(seqs, kmer_size, quorum, min_depth, save_graph, dirname,) +// if err != nil { +// cons = nil +// } + +// return cons +// } + +// computation := func() <-chan *obiseq.BioSequence { +// result := make(chan *obiseq.BioSequence) + +// go func() { +// select { +// case <-ctx.Done(): +// result <- nil +// default: +// result <- consensus() + +// } +// }() + +// return result +// } + +// calcResult := computation() + +// select { +// case result := <-calcResult: +// if result == nil { +// return nil, fmt.Errorf("cannot compute consensus") +// } +// return result, nil +// case <-ctx.Done(): +// return nil, fmt.Errorf("compute consensus timeout, exiting") +// } +// } + func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence { newIter := obiiter.MakeIBioSequence() size := 10 @@ -153,10 +258,19 @@ func Consensus(iterator 
obiiter.IBioSequence) obiiter.IBioSequence { for iterator.Next() { seqs := iterator.Get() - consensus, err := BuildConsensus(seqs.Slice(), + sequences := seqs.Slice() + + id := sequences[0].Source() + if id == "" { + id = sequences[0].Id() + } + consensus, err := BuildConsensus(sequences, + id, CLIKmerSize(), CLIThreshold(), CLIKmerDepth(), - CLISaveGraphToFiles(), CLIGraphFilesDirectory(), + CLIMaxConsensusLength(), + CLISaveGraphToFiles(), + CLIGraphFilesDirectory(), ) if err == nil { diff --git a/pkg/obitools/obiconsensus/options.go b/pkg/obitools/obiconsensus/options.go index 098784e..83c8d03 100644 --- a/pkg/obitools/obiconsensus/options.go +++ b/pkg/obitools/obiconsensus/options.go @@ -9,6 +9,7 @@ var _saveGraph = "__@@NOSAVE@@__" var _kmerSize = -1 var _threshold = 0.99 var _mindepth = -1.0 +var _consensus_max_length = -1 func ObiconsensusOptionSet(options *getoptions.GetOpt) { @@ -38,6 +39,12 @@ func ObiconsensusOptionSet(options *getoptions.GetOpt) { "Default value = -1, which means that the DEPTH is estimated from the data"), ) + options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length, + options.ArgName("LENGTH"), + options.Description("Maximum length of the consensus sequence. 
"+ + "Default value = -1, which means that no limit is applied"), + ) + } func OptionSet(options *getoptions.GetOpt) { @@ -67,3 +74,7 @@ func CLIKmerDepth() float64 { func CLIThreshold() float64 { return _threshold } + +func CLIMaxConsensusLength() int { + return _consensus_max_length +} diff --git a/pkg/obitools/obiminion/obiminion.go b/pkg/obitools/obiminion/obiminion.go new file mode 100644 index 0000000..ed6d6b7 --- /dev/null +++ b/pkg/obitools/obiminion/obiminion.go @@ -0,0 +1,290 @@ +package obiminion + +import ( + "fmt" + "os" + "sync" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obigraph" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiannotate" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiuniq" + "github.com/schollz/progressbar/v3" + log "github.com/sirupsen/logrus" +) + +// SampleWeight calculates the weight of a sample based on the statistics of a sequence. 
+// +// Parameters: +// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice) +// - sample: the sample for which the weight is calculated (string) +// - sample_key: the key used to access the sample's statistics (string) +// Return type: a function that takes an integer index and returns the weight of the sample at that index (func(int) int) +func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func(int) float64 { + + f := func(i int) float64 { + + stats := (*seqs)[i].StatsOn(sample_key, "NA") + + if value, ok := stats[sample]; ok { + return float64(value) + } + + return 0 + } + + return f +} + +// SeqBySamples sorts the sequences by samples. +// +// Parameters: +// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice) +// - sample_key: a string representing the sample key (string) +// +// Return type: +// - map[string]BioSequenceSlice: a map indexed by sample names, each containing a slice of BioSequence objects (map[string]BioSequenceSlice) +func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*obiseq.BioSequenceSlice { + + samples := make(map[string]*obiseq.BioSequenceSlice) + + for _, s := range seqs { + if s.HasStatsOn(sample_key) { + stats := s.StatsOn(sample_key, "NA") + for k := range stats { + if seqset, ok := samples[k]; ok { + *seqset = append(*seqset, s) + samples[k] = seqset + } else { + samples[k] = &obiseq.BioSequenceSlice{s} + } + } + } else { + if k, ok := s.GetStringAttribute(sample_key); ok { + if seqset, ok := samples[k]; ok { + *seqset = append(*seqset, s) + samples[k] = seqset + } else { + samples[k] = &obiseq.BioSequenceSlice{s} + } + } + } + } + + return samples + +} + +type Mutation struct { + Position int + SeqA byte + SeqB byte + Ratio float64 +} + +func BuildDiffSeqGraph(name, name_key string, + seqs *obiseq.BioSequenceSlice, + distmax, nworkers int) *obigraph.Graph[*obiseq.BioSequence, Mutation] { + graph := 
obigraph.NewGraphBuffer[*obiseq.BioSequence, Mutation](name, (*[]*obiseq.BioSequence)(seqs)) + iseq := make(chan int) + defer graph.Close() + + ls := len(*seqs) + + sw := SampleWeight(seqs, name, name_key) + graph.Graph.VertexWeight = sw + + waiting := sync.WaitGroup{} + waiting.Add(nworkers) + + bar := (*progressbar.ProgressBar)(nil) + if obiconvert.CLIProgressBar() { + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription(fmt.Sprintf("[Build graph] on %s", name)), + ) + + bar = progressbar.NewOptions(len(*seqs), pbopt...) + } + + computeEdges := func() { + defer waiting.Done() + for i := range iseq { + s1 := (*seqs)[i] + for j := i + 1; j < ls; j++ { + s2 := (*seqs)[j] + ratio := sw(i) / sw(j) + ok, pos, a1, a2 := obialign.D1Or0(s1, s2) + if ok >= 0 { + graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio}) + } else if distmax > 1 { + lcs, lali := obialign.FastLCSScore(s1, s2, distmax, nil) + dist := lali - lcs + if lcs > 0 && dist <= distmax { + // log.Infof("Seq %s and %s: LCSScore: %d, dist: %d\n", s1.Id(), s2.Id(), lcs, dist) + graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio}) + } + } + } + + if bar != nil { + bar.Add(1) + } + } + } + + for i := 0; i < nworkers; i++ { + go computeEdges() + } + + for i := 0; i < ls; i++ { + iseq <- i + } + close(iseq) + + waiting.Wait() + return graph.Graph +} + +func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation], + sample_key string, kmer_size int, max_length int, threshold float64, depth float64) obiseq.BioSequenceSlice { + denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices)) + + for i, v := range *graph.Vertices { + var err error + var clean *obiseq.BioSequence + degree := graph.Degree(i) + if degree > 4 { + pack := obiseq.MakeBioSequenceSlice(degree + 1) + for k,j := range graph.Neighbors(i) { + 
pack[k] = (*graph.Vertices)[j] + } + pack[degree] = v + clean, err = obiconsensus.BuildConsensus(pack, + fmt.Sprintf("%s_consensus", v.Id()), + kmer_size, + threshold, + depth, max_length, + CLISaveGraphToFiles(), CLIGraphFilesDirectory()) + + if err != nil { + log.Warning(err) + clean = (*graph.Vertices)[i] + clean.SetAttribute("obiminion_consensus", false) + } else { + clean.SetAttribute("obiminion_consensus", true) + } + pack.Recycle(false) + } else { + clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition()) + clean.SetAttribute("obiminion_consensus", false) + } + + clean.SetCount(int(graph.VertexWeight(i))) + clean.SetAttribute(sample_key, graph.Name) + + denoised[i] = clean + } + + return denoised +} +func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence { + dirname := CLIGraphFilesDirectory() + newIter := obiiter.MakeIBioSequence() + + db := itertator.Load() + + log.Infof("Sequence dataset of %d sequeences loaded\n", len(db)) + + samples := SeqBySamples(db, CLISampleAttribute()) + db.Recycle(false) + + log.Infof("Dataset composed of %d samples\n", len(samples)) + if CLIMaxConsensusLength() > 0 { + log.Infof("Maximum consensus length: %d\n", CLIMaxConsensusLength()) + } + + log.Infof("Dataset composed of %d samples\n", len(samples)) + + if CLISaveGraphToFiles() { + if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() { + // path does not exist or is not directory + os.RemoveAll(dirname) + err := os.Mkdir(dirname, 0755) + + if err != nil { + log.Panicf("Cannot create directory %s for saving graphs", dirname) + } + } + } + + bar := (*progressbar.ProgressBar)(nil) + if obiconvert.CLIProgressBar() { + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Filter graph on abundance ratio]"), + ) + + bar = 
progressbar.NewOptions(len(samples), pbopt...) + } + + newIter.Add(1) + + go func() { + sample_order := 0 + for sample, seqs := range samples { + graph := BuildDiffSeqGraph(sample, + CLISampleAttribute(), + seqs, + CLIDistStepMax(), + obioptions.CLIParallelWorkers()) + if bar != nil { + bar.Add(1) + } + + if CLISaveGraphToFiles() { + graph.WriteGmlFile(fmt.Sprintf("%s/%s.gml", + CLIGraphFilesDirectory(), + sample), + false, 1, 0, 3) + } + + denoised := MinionDenoise(graph, + CLISampleAttribute(), + CLIKmerSize(), + CLIMaxConsensusLength(), + CLIThreshold(), + CLIKmerDepth()) + + newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised)) + + sample_order++ + } + + newIter.Done() + }() + + go func() { + newIter.WaitAndClose() + }() + + obiuniq.AddStatsOn(CLISampleAttribute()) + obiuniq.SetUniqueInMemory(false) + obiuniq.SetNoSingleton(CLINoSingleton()) + return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false)) +} diff --git a/pkg/obitools/obiminion/options.go b/pkg/obitools/obiminion/options.go new file mode 100644 index 0000000..68f7681 --- /dev/null +++ b/pkg/obitools/obiminion/options.go @@ -0,0 +1,179 @@ +package obiminion + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var _distStepMax = 1 +var _sampleAttribute = "sample" + +var _ratioMax = 1.0 +var _minEvalRate = 1000 + +var _clusterMode = false +var _onlyHead = false + +var _kmerSize = -1 +var _threshold = 1.0 +var _mindepth = -1.0 + +var _consensus_max_length = 1000 + +var _NoSingleton = false + +var _saveGraph = "__@@NOSAVE@@__" +var _saveRatio = "__@@NOSAVE@@__" + +// ObiminionOptionSet sets the options for obiminion. +// +// options: The options for configuring obiminion. 
+func ObiminionOptionSet(options *getoptions.GetOpt) { + options.StringVar(&_sampleAttribute, "sample", _sampleAttribute, + options.Alias("s"), + options.Description("Attribute containing sample descriptions (default %s).")) + + options.IntVar(&_distStepMax, "distance", _distStepMax, + options.Alias("d"), + options.Description("Maximum numbers of differences between two variant sequences (default: %d).")) + + options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate, + options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate.")) + + options.StringVar(&_saveGraph, "save-graph", _saveGraph, + options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+ + "The graph files follow the graphml format."), + ) + + options.StringVar(&_saveRatio, "save-ratio", _saveRatio, + options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+ + "The ratio file follows the csv format."), + ) + options.IntVar(&_kmerSize, "kmer-size", _kmerSize, + options.ArgName("SIZE"), + options.Description("The size of the kmer used to build the consensus. "+ + "Default value = -1, which means that the kmer size is estimated from the data"), + ) + + options.Float64Var(&_threshold, "threshold", _threshold, + options.ArgName("RATIO"), + options.Description("A threshold between O and 1 used to determine the optimal "+ + "kmer size"), + ) + + options.Float64Var(&_mindepth, "min-depth", _mindepth, + options.ArgName("DEPTH"), + options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+ + "reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+ + "than 1, indicate the minimum count of occurrence for a kmer to be kept. 
"+ + "Default value = -1, which means that the DEPTH is estimated from the data"), + ) + + options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length, + options.ArgName("LENGTH"), + options.Description("Maximum length of the consensus sequence. "+ + "Default value = -1, which means that no limit is applied"), + ) + + options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton, + options.Description("If set, sequences occurring a single time in the data set are discarded.")) + + +} + +// OptionSet sets up the options for the obiminion package. +// +// It takes a pointer to a getoptions.GetOpt object as a parameter. +// It does not return any value. +func OptionSet(options *getoptions.GetOpt) { + obiconvert.InputOptionSet(options) + obiconvert.OutputOptionSet(options) + ObiminionOptionSet(options) +} + +// CLIDistStepMax returns the maximum distance between two sequences. +// +// The value of the distance is set by the user with the `-d` flag. +// +// No parameters. +// Returns an integer. +func CLIDistStepMax() int { + return _distStepMax +} + +// CLISampleAttribute returns the name of the attribute used to store sample name. +// +// The value of the sample attribute is set by the user with the `-s` flag. +// +// No parameters. +// Returns a string. 
+func CLISampleAttribute() string { + return _sampleAttribute +} + +// > The function `CLIMinCountToEvalMutationRate()` returns the minimum number of reads that must be +// observed before the mutation rate can be evaluated +func CLIMinCountToEvalMutationRate() int { + return _minEvalRate +} + +func ClusterMode() bool { + return _clusterMode +} + +// `OnlyHead()` returns a boolean value that indicates whether the `-h` flag was passed to the program +func OnlyHead() bool { + return _onlyHead +} + +// Returns true it the obliclean graphs must be saved +func CLISaveGraphToFiles() bool { + return _saveGraph != "__@@NOSAVE@@__" +} + +// It returns the directory where the graph files are saved +func CLIGraphFilesDirectory() string { + return _saveGraph +} + +// Returns true it the table of ratio must be saved +func IsSaveRatioTable() bool { + return _saveRatio != "__@@NOSAVE@@__" +} + +// It returns the filename of the file that stores the ratio table +func RatioTableFilename() string { + return _saveRatio +} + +// CLIKmerSize returns the value of the kmer size to use for building the consensus. +// +// The value of the kmer size is set by the user with the `-k` flag. +// The value -1 means that the kmer size is estimated as the minimum value that +// insure that no kmer are present more than one time in a sequence. +// +// No parameters. +// Returns an integer value. +func CLIKmerSize() int { + return _kmerSize +} + +func CLIKmerDepth() float64 { + return _mindepth +} + +func CLIThreshold() float64 { + return _threshold +} + +func CLIMaxConsensusLength() int { + return _consensus_max_length +} + +// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded. +// +// No parameters. +// Returns a boolean value indicating whether or not singleton sequences should be discarded. 
+func CLINoSingleton() bool { + return _NoSingleton +} \ No newline at end of file diff --git a/pkg/obitools/obipcr/pcr.go b/pkg/obitools/obipcr/pcr.go index 3b36b9d..e9f03d9 100644 --- a/pkg/obitools/obipcr/pcr.go +++ b/pkg/obitools/obipcr/pcr.go @@ -47,8 +47,8 @@ func CLIPCR(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) { frags := obiiter.IFragments( CLIMaxLength()*1000, CLIMaxLength()*100, - CLIMaxLength()+obiutils.MaxInt(len(CLIForwardPrimer()), - len(CLIReversePrimer()))+obiutils.MinInt(len(CLIForwardPrimer()), + CLIMaxLength()+obiutils.Max(len(CLIForwardPrimer()), + len(CLIReversePrimer()))+obiutils.Min(len(CLIForwardPrimer()), len(CLIReversePrimer()))/2, 100, obioptions.CLIParallelWorkers(), diff --git a/pkg/obitools/obirefidx/obirefidx.go b/pkg/obitools/obirefidx/obirefidx.go index 2f897b6..39d1c15 100644 --- a/pkg/obitools/obirefidx/obirefidx.go +++ b/pkg/obitools/obirefidx/obirefidx.go @@ -63,7 +63,7 @@ func IndexSequence(seqidx int, if lca[order] == ancestor { // nseq[i]++ if mini != -1 { - wordmin = obiutils.MaxInt(sequence.Len(), references[order].Len()) - 3 - 4*mini + wordmin = obiutils.Max(sequence.Len(), references[order].Len()) - 3 - 4*mini } if cw[order] < wordmin { @@ -189,7 +189,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { indexed := obiiter.MakeIBioSequence() go func() { for i := 0; i < len(references); i += 10 { - limits <- [2]int{i, obiutils.MinInt(i+10, len(references))} + limits <- [2]int{i, obiutils.Min(i+10, len(references))} } close(limits) }() diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 48318d3..b4b8e84 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -110,7 +110,7 @@ func FindClosests(sequence *obiseq.BioSequence, d, _, _, _ := obialign.D1Or0(sequence, references[order]) if d >= 0 { score = d - alilength = obiutils.MaxInt(sequence.Len(), ref.Len()) + alilength = obiutils.Max(sequence.Len(), ref.Len()) lcs = alilength 
- score } } else { diff --git a/pkg/obitools/obiuniq/options.go b/pkg/obitools/obiuniq/options.go index 5620fa5..1b56fae 100644 --- a/pkg/obitools/obiuniq/options.go +++ b/pkg/obitools/obiuniq/options.go @@ -12,6 +12,11 @@ var _chunks = 100 var _NAValue = "NA" var _NoSingleton = false +// UniqueOptionSet sets up unique options for the obiuniq command. +// +// It configures various options such as merging attributes, category attributes, +// defining the NA value, handling singleton sequences, choosing between in-memory +// or disk storage, and specifying the chunk count for dataset division. func UniqueOptionSet(options *getoptions.GetOpt) { options.StringSliceVar(&_StatsOn, "merge", 1, 1, @@ -40,25 +45,67 @@ func UniqueOptionSet(options *getoptions.GetOpt) { } + // OptionSet adds to the basic option set every options declared for // the obiuniq command +// +// It takes a pointer to a GetOpt struct as its parameter and does not return anything. func OptionSet(options *getoptions.GetOpt) { obiconvert.OptionSet(options) UniqueOptionSet(options) } +// CLIStatsOn returns the list of variables on which statistics are computed. +// +// It does not take any parameters. +// It returns a slice of strings representing the statistics on values. func CLIStatsOn() []string { return _StatsOn } +// SetStatsOn sets the list of variables on which statistics are computed. +// +// It takes a slice of strings as its parameter and does not return anything. +func SetStatsOn(statsOn []string) { + _StatsOn = statsOn +} + +// AddStatsOn adds a variable to the list of variables on which statistics are computed. +// +// Parameters: +// - statsOn: variadic strings representing the statistics to be added. +func AddStatsOn(statsOn ...string) { + _StatsOn = append(_StatsOn, statsOn...) +} + +// CLIKeys returns the keys used to distinguish among identical sequences. +// +// It does not take any parameters. +// It returns a slice of strings representing the keys used by the CLI. 
func CLIKeys() []string { return _Keys } +// CLIUniqueInMemory returns if the unique function is running in memory only. +// +// It does not take any parameters. +// It returns a boolean value indicating whether the function is running in memory or not. func CLIUniqueInMemory() bool { return _InMemory } +// SetUniqueInMemory sets whether the unique function is running in memory or not. +// +// inMemory bool - A boolean value indicating whether the function is running in memory. +// No return value. +func SetUniqueInMemory(inMemory bool) { + _InMemory = inMemory +} + +// CLINumberOfChunks returns the number of chunks used for the first bucket sort step used by the unique function. +// +// It does not take any parameters. +// It returns an integer representing the number of chunks. func CLINumberOfChunks() int { if _chunks <= 1 { return 1 @@ -67,10 +114,40 @@ func CLINumberOfChunks() int { return _chunks } +// SetNumberOfChunks sets the number of chunks used for the first bucket sort step used by the unique function. +// +// chunks int - The number of chunks to be set. +// No return value. +func SetNumberOfChunks(chunks int) { + _chunks = chunks +} + +// CLINAValue returns the value used as a placeholder for missing values. +// +// No parameters. +// Return type: string. func CLINAValue() string { return _NAValue } +// SetNAValue sets the NA value to the specified string. +// +// value string - The value to set as the NA value. +func SetNAValue(value string) { + _NAValue = value +} + +// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded. +// +// No parameters. +// Returns a boolean value indicating whether or not singleton sequences should be discarded. func CLINoSingleton() bool { return _NoSingleton } + +// SetNoSingleton sets the boolean value indicating whether or not singleton sequences should be discarded. +// +// noSingleton bool - The boolean value to set for _NoSingleton. 
+func SetNoSingleton(noSingleton bool) { + _NoSingleton = noSingleton +} \ No newline at end of file diff --git a/pkg/obitools/obiuniq/unique.go b/pkg/obitools/obiuniq/unique.go index d04faa5..c651eff 100644 --- a/pkg/obitools/obiuniq/unique.go +++ b/pkg/obitools/obiuniq/unique.go @@ -8,7 +8,7 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" ) -func Unique(sequences obiiter.IBioSequence) obiiter.IBioSequence { +func CLIUnique(sequences obiiter.IBioSequence) obiiter.IBioSequence { options := make([]obichunk.WithOption, 0, 30) diff --git a/pkg/obiutils/minmax.go b/pkg/obiutils/minmax.go index d5bec7a..fdcc10a 100644 --- a/pkg/obiutils/minmax.go +++ b/pkg/obiutils/minmax.go @@ -1,69 +1,32 @@ package obiutils -import "golang.org/x/exp/constraints" +import ( + "golang.org/x/exp/constraints" +) -func MinInt(x, y int) int { +func Min[T constraints.Ordered](x, y T) T { if x < y { return x } return y } -func MaxInt(x, y int) int { +func Max[T constraints.Ordered](x, y T) T { if x < y { return y } return x } -func MinMaxInt(x, y int) (int, int) { +func MinMax[T constraints.Ordered](x, y T) (T, T) { if x < y { return x, y } return y, x } -func MinUInt16(x, y uint16) uint16 { - if x < y { - return x - } - return y -} -func MaxUInt16(x, y uint16) uint16 { - if x < y { - return y - } - return x -} - -func MinSlice[T constraints.Ordered](vec []T) T { - if len(vec) == 0 { - panic("empty slice") - } - min := vec[0] - for _, v := range vec { - if v < min { - min = v - } - } - return min -} - -func MaxSlice[T constraints.Ordered](vec []T) T { - if len(vec) == 0 { - panic("empty slice") - } - max := vec[0] - for _, v := range vec { - if v > max { - max = v - } - } - return max -} - -func RangeSlice[T constraints.Ordered](vec []T) (min, max T) { +func MinMaxSlice[T constraints.Ordered](vec []T) (min, max T) { if len(vec) == 0 { panic("empty slice") } diff --git a/pkg/obiutils/uint128.go b/pkg/obiutils/uint128.go index 7d71845..3548111 100644 --- 
a/pkg/obiutils/uint128.go +++ b/pkg/obiutils/uint128.go @@ -13,8 +13,8 @@ import ( // Zero is a zero-valued uint128. var Zero Uint128 -// Max is the largest possible uint128 value. -var Max = New(math.MaxUint64, math.MaxUint64) +// MaxUint128 is the largest possible uint128 value. +var MaxUint128 = New(math.MaxUint64, math.MaxUint64) // A Uint128 is an unsigned 128-bit number. type Uint128 struct {