From 2a11adb346ef579054955e8c3443c73d5ed66c04 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric.coissac@metabarcoding.org>
Date: Fri, 25 Aug 2023 14:36:38 +0200
Subject: [PATCH] Add some doc and switch to the parallel gzip library

Former-commit-id: 2c1187001f989ba3de5895f516d4c8b54d52a4c4
---
 pkg/obialign/dnamatrix.go        | 28 +++++++++
 pkg/obialign/fastlcsegf.go       | 60 +++++++++++++++++++
 pkg/obiformats/ecopcr_read.go    |  2 +-
 pkg/obiformats/universal_read.go |  4 +-
 pkg/obiiter/workers.go           |  3 +-
 pkg/obiutils/array.go            | 40 +++++++++++++
 pkg/obiutils/gzipfile.go         |  2 +-
 pkg/obiutils/set.go              | 99 ++++++++++++++++++++++++++++++++
 8 files changed, 233 insertions(+), 5 deletions(-)
 create mode 100644 pkg/obiutils/array.go
 create mode 100644 pkg/obiutils/set.go

diff --git a/pkg/obialign/dnamatrix.go b/pkg/obialign/dnamatrix.go
index 6d8cac0..fdb982c 100644
--- a/pkg/obialign/dnamatrix.go
+++ b/pkg/obialign/dnamatrix.go
@@ -29,20 +29,39 @@ var _NucPartMatch [32][32]float64
 var _NucScorePartMatchMatch [100][100]int
 var _NucScorePartMatchMismatch [100][100]int
 
+// _MatchRatio calculates the match ratio between two bytes.
+//
+// It takes two parameters, a and b, which are bytes to be compared.
+// The function returns a float64 value representing the match ratio.
 func _MatchRatio(a, b byte) float64 {
 	// count of common bits
 	cm := _FourBitsCount[a&b&15]
 
+	// count of bits in a
 	ca := _FourBitsCount[a&15]
+
+	// count of bits in b
 	cb := _FourBitsCount[b&15]
 
+	// check if any of the counts is zero
 	if cm == 0 || ca == 0 || cb == 0 {
 		return float64(0)
 	}
 
+	// calculate the match ratio
 	return float64(cm) / float64(ca) / float64(cb)
 }
 
+// _Logaddexp calculates the logarithm of the sum of exponentials of two given numbers.
+//
+// Parameters:
+//
+//	a - the first number (float64)
+//	b - the second number (float64)
+//
+// Returns:
+//
+//	float64 - the result of the calculation
 func _Logaddexp(a, b float64) float64 {
 	if a > b {
 		a, b = b, a
@@ -51,6 +70,15 @@ func _Logaddexp(a, b float64) float64 {
 	return b + math.Log1p(math.Exp(a-b))
 }
 
+// _MatchScoreRatio calculates the match score ratio between two bytes.
+//
+// Parameters:
+// - a: the first byte
+// - b: the second byte
+//
+// Returns:
+// - float64: the match score ratio when a match is observed
+// - float64: the match score ratio when a mismatch is observed
 func _MatchScoreRatio(a, b byte) (float64, float64) {
 
 	l2 := math.Log(2)
diff --git a/pkg/obialign/fastlcsegf.go b/pkg/obialign/fastlcsegf.go
index 943a2f6..586e799 100644
--- a/pkg/obialign/fastlcsegf.go
+++ b/pkg/obialign/fastlcsegf.go
@@ -31,6 +31,28 @@ func _samenuc(a, b byte) bool {
 	}
 	return a == b
 }
+
+// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
+//
+// The score is calculated using the following scoring matrix:
+//   - Match : +1
+//   - Mismatch and gap: 0
+//
+// The LCS is calculated using the Needleman-Wunsch algorithm.
+// At the same time the length of the shortest path between the two sequences is calculated.
+// If the endgapfree flag is set to true, the returned length does not include the end gaps.
+// If the number of mismatches or gaps is larger than the maximum allowed error, -1 is returned for both.
+//
+// Parameters:
+// - bA: The first byte slice.
+// - bB: The second byte slice.
+// - maxError: The maximum allowed error. If set to -1, no limit is applied.
+// - endgapfree: A boolean flag indicating whether the LCS should be end-gap free.
+// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
+//
+// Returns:
+// - The score of the LCS.
+// - The length of the LCS.
 func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {
 
 	lA := len(bA)
@@ -316,10 +338,48 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
 	return s, l
 }
 
+// FastLCSEGFScore calculates the score of the longest common subsequence between two bio sequences in end-gap-free mode.
+//
+// if maxError > 0, the maximum allowed error between the sequences is maxError.
+// Otherwise, no error checking is done.
+// If the actual number of errors is larger than maxError, -1 is returned for both values.
+//
+// The score matrix is:
+//   - Matching: 1
+//   - Mismatch or gap: 0
+//
+// Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps.
+//
+// if buffer != nil, the buffer is used to store intermediate results.
+// Otherwise, a new buffer is allocated.
+//
+// seqA: The first bio sequence.
+// seqB: The second bio sequence.
+// maxError: The maximum allowed error between the sequences.
+// buffer: A buffer to store intermediate results.
+// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
 func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
 	return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
 }
 
+// FastLCSScore calculates the score of the longest common subsequence between two bio sequences.
+//
+// if maxError > 0, the maximum allowed error between the sequences is maxError.
+// Otherwise, no error checking is done.
+// If the actual number of errors is larger than maxError, -1 is returned for both values.
+//
+// The score matrix is:
+//   - Matching: 1
+//   - Mismatch or gap: 0
+//
+// if buffer != nil, the buffer is used to store intermediate results.
+// Otherwise, a new buffer is allocated.
+//
+// seqA: The first bio sequence.
+// seqB: The second bio sequence.
+// maxError: The maximum allowed error between the sequences.
+// buffer: A buffer to store intermediate results.
+// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
 func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
 	return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
 }
diff --git a/pkg/obiformats/ecopcr_read.go b/pkg/obiformats/ecopcr_read.go
index 96b68b7..1dae062 100644
--- a/pkg/obiformats/ecopcr_read.go
+++ b/pkg/obiformats/ecopcr_read.go
@@ -1,9 +1,9 @@
 package obiformats
 
 import (
-	"compress/gzip"
 	"encoding/csv"
 	"fmt"
+	gzip "github.com/klauspost/pgzip"
 	"io"
 	"os"
 	"path"
diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go
index 5143303..f091ecb 100644
--- a/pkg/obiformats/universal_read.go
+++ b/pkg/obiformats/universal_read.go
@@ -2,7 +2,7 @@ package obiformats
 
 import (
 	"bufio"
-	"compress/gzip"
+	gzip "github.com/klauspost/pgzip"
 	"io"
 	"os"
 	"path"
@@ -52,7 +52,7 @@ func ReadSequencesFromFile(filename string,
 	var err error
 
 	options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
-	
+
 	file, err = os.Open(filename)
 
 	if err != nil {
diff --git a/pkg/obiiter/workers.go b/pkg/obiiter/workers.go
index b7adc6e..469895b 100644
--- a/pkg/obiiter/workers.go
+++ b/pkg/obiiter/workers.go
@@ -3,6 +3,7 @@ package obiiter
 import (
 	log "github.com/sirupsen/logrus"
 
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
 )
 
@@ -15,7 +16,7 @@ import (
 //   - First is allowing to indicates the number of workers running in parallele (default 4)
 //   - The second the size of the chanel buffer. By default set to the same value than the input buffer.
 func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence {
-	nworkers := 4
+	nworkers := obioptions.CLIParallelWorkers()
 
 	if len(sizes) > 0 {
 		nworkers = sizes[0]
diff --git a/pkg/obiutils/array.go b/pkg/obiutils/array.go
new file mode 100644
index 0000000..0fa4346
--- /dev/null
+++ b/pkg/obiutils/array.go
@@ -0,0 +1,40 @@
+package obiutils
+
+// Matrix is a generic type representing a matrix.
+type Matrix[T any] [][]T
+
+// Make2DArray generates a 2D array of type T with the specified number of rows and columns.
+//
+// Data is stored in a contiguous memory block in row-major order.
+//
+// Parameters:
+// - rows: the number of rows in the 2D array.
+// - cols: the number of columns in the 2D array.
+//
+// Return:
+// - Matrix[T]: the generated 2D array of type T.
+func Make2DArray[T any](rows, cols int) Matrix[T] {
+	matrix := make(Matrix[T], rows)
+	data := make([]T, cols*rows)
+	for i := 0; i < rows; i++ {
+		matrix[i] = data[i*cols : (i+1)*cols]
+	}
+	return matrix
+}
+
+// Row returns the i-th row of the matrix.
+//
+// Parameters:
+//
+//	i - the index of the row to retrieve.
+//
+// Return:
+//
+//	[]T - the i-th row of the matrix.
+func (matrix *Matrix[T]) Column(i int) []T {
+	r := make([]T, len(*matrix))
+	for j := 0; j < len(*matrix); j++ {
+		r[j] = (*matrix)[j][i]
+	}
+	return r
+}
diff --git a/pkg/obiutils/gzipfile.go b/pkg/obiutils/gzipfile.go
index b535b4c..ed2c74e 100644
--- a/pkg/obiutils/gzipfile.go
+++ b/pkg/obiutils/gzipfile.go
@@ -2,7 +2,7 @@ package obiutils
 
 import (
 	"bufio"
-	"compress/gzip"
+	gzip "github.com/klauspost/pgzip"
 	"io"
 	"os"
 )
diff --git a/pkg/obiutils/set.go b/pkg/obiutils/set.go
new file mode 100644
index 0000000..4a9ef52
--- /dev/null
+++ b/pkg/obiutils/set.go
@@ -0,0 +1,99 @@
+package obiutils
+
+import "fmt"
+
+type Set[E comparable] map[E]struct{}
+
+// MakeSet creates a new Set with the provided values.
+//
+// The function takes a variadic parameter `vals` of type `E` which represents the values
+// that will be added to the Set.
+//
+// The function returns a Set of type `Set[E]` which contains the provided values.
+func MakeSet[E comparable](vals ...E) Set[E] {
+	s := Set[E]{}
+	for _, v := range vals {
+		s[v] = struct{}{}
+	}
+	return s
+}
+
+// NewSet creates a new Set with the given values.
+//
+// It takes a variadic parameter of type E, where E is a comparable type.
+// It returns a pointer to a Set of type E.
+func NewSet[E comparable](vals ...E) *Set[E] {
+	s := MakeSet[E](vals...)
+	return &s
+}
+
+// Add adds the given values to the set.
+//
+// It takes a variadic parameter `vals` of type `E`.
+// There is no return type for this function.
+func (s Set[E]) Add(vals ...E) {
+	for _, v := range vals {
+		s[v] = struct{}{}
+	}
+}
+
+// Contains checks if the set contains a given element.
+//
+// Parameters:
+// - v: the element to check for presence in the set.
+//
+// Returns:
+// - bool: true if the set contains the given element, false otherwise.
+func (s Set[E]) Contains(v E) bool {
+	_, ok := s[v]
+	return ok
+}
+
+// Members returns a slice of all the elements in the set.
+//
+// It does not modify the original set.
+// It returns a slice of type []E.
+func (s Set[E]) Members() []E {
+	result := make([]E, 0, len(s))
+	for v := range s {
+		result = append(result, v)
+	}
+	return result
+}
+
+// String returns a string representation of the set.
+//
+// It returns a string representation of the set by formatting the set's members using the fmt.Sprintf function.
+// The resulting string is then returned.
+func (s Set[E]) String() string {
+	return fmt.Sprintf("%v", s.Members())
+}
+
+// Union returns a new set that is the union of the current set and the specified set.
+//
+// Parameters:
+// - s2: the set to be unioned with the current set.
+//
+// Return:
+// - Set[E]: the resulting set after the union operation.
+func (s Set[E]) Union(s2 Set[E]) Set[E] {
+	result := MakeSet(s.Members()...)
+	result.Add(s2.Members()...)
+	return result
+}
+
+// Intersection returns a new set that contains the common elements between the current set and another set.
+//
+// Parameter:
+// - s2: the other set to compare with.
+// Return:
+// - Set[E]: a new set that contains the common elements.
+func (s Set[E]) Intersection(s2 Set[E]) Set[E] {
+	result := MakeSet[E]()
+	for _, v := range s.Members() {
+		if s2.Contains(v) {
+			result.Add(v)
+		}
+	}
+	return result
+}