Add some doc and switch to the parallel gzip library

Former-commit-id: 2c1187001f989ba3de5895f516d4c8b54d52a4c4
2025-06-29 16:20:46 +00:00 · 2023-08-25 14:36:38 +02:00
parent 8a98210103
commit 2a11adb346
8 changed files with 233 additions and 5 deletions
--- a/pkg/obialign/dnamatrix.go
+++ b/pkg/obialign/dnamatrix.go
@ -29,20 +29,39 @@ var _NucPartMatch [32][32]float64
 var _NucScorePartMatchMatch [100][100]int
 var _NucScorePartMatchMismatch [100][100]int
 // _MatchRatio calculates the match ratio between two bytes.
 //
 // It takes two parameters, a and b, which are bytes to be compared.
 // The function returns a float64 value representing the match ratio.
 func _MatchRatio(a, b byte) float64 {
 	// count of common bits
 	cm := _FourBitsCount[a&b&15]
 	// count of bits in a
 	ca := _FourBitsCount[a&15]
 	// count of bits in b
 	cb := _FourBitsCount[b&15]
 	// check if any of the counts is zero
 	if cm == 0 || ca == 0 || cb == 0 {
 		return float64(0)
 	}
 	// calculate the match ratio
 	return float64(cm) / float64(ca) / float64(cb)
 }
 // _Logaddexp calculates the logarithm of the sum of exponentials of two given numbers.
 //
 // Parameters:
 //
 //	a - the first number (float64)
 //	b - the second number (float64)
 //
 // Returns:
 //
 //	float64 - the result of the calculation
 func _Logaddexp(a, b float64) float64 {
 	if a > b {
 		a, b = b, a
@ -51,6 +70,15 @@ func _Logaddexp(a, b float64) float64 {
 	return b + math.Log1p(math.Exp(a-b))
 }
 // _MatchScoreRatio calculates the match score ratio between two bytes.
 //
 // Parameters:
 // - a: the first byte
 // - b: the second byte
 //
 // Returns:
 // - float64: the match score ratio when a match is observed
 // - float64: the match score ratio when a mismatch is observed
 func _MatchScoreRatio(a, b byte) (float64, float64) {
 	l2 := math.Log(2)
--- a/pkg/obialign/fastlcsegf.go
+++ b/pkg/obialign/fastlcsegf.go
@ -31,6 +31,28 @@ func _samenuc(a, b byte) bool {
 	}
 	return a == b
 }
 // FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
 //
 // The score is calculated using the following scoring matrix:
 //   - Match : +1
 //   - Mismatch and gap: 0
 //
 // The LCS is calculated using the Needleman-Wunsch algorithm.
 // At the same time the length of the shortest path between the two sequences is calculated.
 // If the endgapfree flag is set to true, the returned length does not include the end gaps.
 // If the number of mismatches or gaps is larger than the maximum allowed error, -1 is returned for both.
 //
 // Parameters:
 // - bA: The first byte slice.
 // - bB: The second byte slice.
 // - maxError: The maximum allowed error. If set to -1, no limit is applied.
 // - endgapfree: A boolean flag indicating whether the LCS should be end-gap free.
 // - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
 //
 // Returns:
 // - The score of the LCS.
 // - The length of the LCS.
 func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {
 	lA := len(bA)
@ -316,10 +338,48 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
 	return s, l
 }
 // FastLCSEGFScore calculates the score of the longest common subsequence between two bio sequences in end-gap-free mode.
 //
 // if maxError > 0, the maximum allowed error between the sequences is maxError.
 // Otherwise, no error checking is done.
 // If the actual number of errors is larger than maxError, -1 is returned for both values.
 //
 // The score matrix is:
 //   - Matching: 1
 //   - Mismatch or gap: 0
 //
 // Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps.
 //
 // if buffer != nil, the buffer is used to store intermediate results.
 // Otherwise, a new buffer is allocated.
 //
 // seqA: The first bio sequence.
 // seqB: The second bio sequence.
 // maxError: The maximum allowed error between the sequences.
 // buffer: A buffer to store intermediate results.
 // Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
 func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
 	return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
 }
 // FastLCSScore calculates the score of the longest common subsequence between two bio sequences.
 //
 // if maxError > 0, the maximum allowed error between the sequences is maxError.
 // Otherwise, no error checking is done.
 // If the actual number of errors is larger than maxError, -1 is returned for both values.
 //
 // The score matrix is:
 //   - Matching: 1
 //   - Mismatch or gap: 0
 //
 // if buffer != nil, the buffer is used to store intermediate results.
 // Otherwise, a new buffer is allocated.
 //
 // seqA: The first bio sequence.
 // seqB: The second bio sequence.
 // maxError: The maximum allowed error between the sequences.
 // buffer: A buffer to store intermediate results.
 // Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
 func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
 	return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
 }
--- a/pkg/obiformats/ecopcr_read.go
+++ b/pkg/obiformats/ecopcr_read.go
@ -1,9 +1,9 @@
 package obiformats
 import (
 	"compress/gzip"
 	"encoding/csv"
 	"fmt"
 	gzip "github.com/klauspost/pgzip"
 	"io"
 	"os"
 	"path"
--- a/pkg/obiformats/universal_read.go
+++ b/pkg/obiformats/universal_read.go
@ -2,7 +2,7 @@ package obiformats
 import (
 	"bufio"
-	"compress/gzip"
+	gzip "github.com/klauspost/pgzip"
 	"io"
 	"os"
 	"path"
@ -52,7 +52,7 @@ func ReadSequencesFromFile(filename string,
 	var err error
 	options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
-	
+
 	file, err = os.Open(filename)
 	if err != nil {
--- a/pkg/obiiter/workers.go
+++ b/pkg/obiiter/workers.go
@ -3,6 +3,7 @@ package obiiter
 import (
 	log "github.com/sirupsen/logrus"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
 )
@ -15,7 +16,7 @@ import (
 //   - First is allowing to indicates the number of workers running in parallele (default 4)
 //   - The second the size of the chanel buffer. By default set to the same value than the input buffer.
 func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence {
-	nworkers := 4
+	nworkers := obioptions.CLIParallelWorkers()
 	if len(sizes) > 0 {
 		nworkers = sizes[0]
--- a/pkg/obiutils/array.go
+++ b/pkg/obiutils/array.go
@ -0,0 +1,40 @@
 package obiutils
 // Matrix is a generic type representing a matrix.
 type Matrix[T any] [][]T
 // Make2DArray generates a 2D array of type T with the specified number of rows and columns.
 //
 // Data is stored in a contiguous memory block in row-major order.
 //
 // Parameters:
 // - rows: the number of rows in the 2D array.
 // - cols: the number of columns in the 2D array.
 //
 // Return:
 // - Matrix[T]: the generated 2D array of type T.
 func Make2DArray[T any](rows, cols int) Matrix[T] {
 	matrix := make(Matrix[T], rows)
 	data := make([]T, cols*rows)
 	for i := 0; i < rows; i++ {
 		matrix[i] = data[i*cols : (i+1)*cols]
 	}
 	return matrix
 }
 // Row returns the i-th row of the matrix.
 //
 // Parameters:
 //
 //	i - the index of the row to retrieve.
 //
 // Return:
 //
 //	[]T - the i-th row of the matrix.
 func (matrix *Matrix[T]) Column(i int) []T {
 	r := make([]T, len(*matrix))
 	for j := 0; j < len(*matrix); j++ {
 		r[j] = (*matrix)[j][i]
 	}
 	return r
 }
--- a/pkg/obiutils/gzipfile.go
+++ b/pkg/obiutils/gzipfile.go
@ -2,7 +2,7 @@ package obiutils
 import (
 	"bufio"
-	"compress/gzip"
+	gzip "github.com/klauspost/pgzip"
 	"io"
 	"os"
 )
--- a/pkg/obiutils/set.go
+++ b/pkg/obiutils/set.go
@ -0,0 +1,99 @@
 package obiutils
 import "fmt"
 type Set[E comparable] map[E]struct{}
 // MakeSet creates a new Set with the provided values.
 //
 // The function takes a variadic parameter `vals` of type `E` which represents the values
 // that will be added to the Set.
 //
 // The function returns a Set of type `Set[E]` which contains the provided values.
 func MakeSet[E comparable](vals ...E) Set[E] {
 	s := Set[E]{}
 	for _, v := range vals {
 		s[v] = struct{}{}
 	}
 	return s
 }
 // NewSet creates a new Set with the given values.
 //
 // It takes a variadic parameter of type E, where E is a comparable type.
 // It returns a pointer to a Set of type E.
 func NewSet[E comparable](vals ...E) *Set[E] {
 	s := MakeSet[E](vals...)
 	return &s
 }
 // Add adds the given values to the set.
 //
 // It takes a variadic parameter `vals` of type `E`.
 // There is no return type for this function.
 func (s Set[E]) Add(vals ...E) {
 	for _, v := range vals {
 		s[v] = struct{}{}
 	}
 }
 // Contains checks if the set contains a given element.
 //
 // Parameters:
 // - v: the element to check for presence in the set.
 //
 // Returns:
 // - bool: true if the set contains the given element, false otherwise.
 func (s Set[E]) Contains(v E) bool {
 	_, ok := s[v]
 	return ok
 }
 // Members returns a slice of all the elements in the set.
 //
 // It does not modify the original set.
 // It returns a slice of type []E.
 func (s Set[E]) Members() []E {
 	result := make([]E, 0, len(s))
 	for v := range s {
 		result = append(result, v)
 	}
 	return result
 }
 // String returns a string representation of the set.
 //
 // It returns a string representation of the set by formatting the set's members using the fmt.Sprintf function.
 // The resulting string is then returned.
 func (s Set[E]) String() string {
 	return fmt.Sprintf("%v", s.Members())
 }
 // Union returns a new set that is the union of the current set and the specified set.
 //
 // Parameters:
 // - s2: the set to be unioned with the current set.
 //
 // Return:
 // - Set[E]: the resulting set after the union operation.
 func (s Set[E]) Union(s2 Set[E]) Set[E] {
 	result := MakeSet(s.Members()...)
 	result.Add(s2.Members()...)
 	return result
 }
 // Intersection returns a new set that contains the common elements between the current set and another set.
 //
 // Parameter:
 // - s2: the other set to compare with.
 // Return:
 // - Set[E]: a new set that contains the common elements.
 func (s Set[E]) Intersection(s2 Set[E]) Set[E] {
 	result := MakeSet[E]()
 	for _, v := range s.Members() {
 		if s2.Contains(v) {
 			result.Add(v)
 		}
 	}
 	return result
 }