Add some doc and switch to the parallel gzip library

Former-commit-id: 2c1187001f989ba3de5895f516d4c8b54d52a4c4
This commit is contained in:
2023-08-25 14:36:38 +02:00
parent 8a98210103
commit 2a11adb346
8 changed files with 233 additions and 5 deletions

View File

@ -29,20 +29,39 @@ var _NucPartMatch [32][32]float64
var _NucScorePartMatchMatch [100][100]int
var _NucScorePartMatchMismatch [100][100]int
// _MatchRatio calculates the match ratio between two bytes.
//
// It takes two parameters, a and b, which are bytes to be compared.
// The function returns a float64 value representing the match ratio.
func _MatchRatio(a, b byte) float64 {
// count of common bits
cm := _FourBitsCount[a&b&15]
// count of bits in a
ca := _FourBitsCount[a&15]
// count of bits in b
cb := _FourBitsCount[b&15]
// check if any of the counts is zero
if cm == 0 || ca == 0 || cb == 0 {
return float64(0)
}
// calculate the match ratio
return float64(cm) / float64(ca) / float64(cb)
}
// _Logaddexp calculates the logarithm of the sum of exponentials of two given numbers.
//
// Parameters:
//
// a - the first number (float64)
// b - the second number (float64)
//
// Returns:
//
// float64 - the result of the calculation
func _Logaddexp(a, b float64) float64 {
if a > b {
a, b = b, a
@ -51,6 +70,15 @@ func _Logaddexp(a, b float64) float64 {
return b + math.Log1p(math.Exp(a-b))
}
// _MatchScoreRatio calculates the match score ratio between two bytes.
//
// Parameters:
// - a: the first byte
// - b: the second byte
//
// Returns:
// - float64: the match score ratio when a match is observed
// - float64: the match score ratio when a mismatch is observed
func _MatchScoreRatio(a, b byte) (float64, float64) {
l2 := math.Log(2)

View File

@ -31,6 +31,28 @@ func _samenuc(a, b byte) bool {
}
return a == b
}
// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
//
// The score is calculated using the following scoring matrix:
// - Match : +1
// - Mismatch and gap: 0
//
// The LCS is calculated using the Needleman-Wunsch algorithm.
// At the same time the length of the shortest path between the two sequences is calculated.
// If the endgapfree flag is set to true, the returned length does not include the end gaps.
// If the number of mismatches or gaps is larger than the maximum allowed error, -1 is returned for both.
//
// Parameters:
// - bA: The first byte slice.
// - bB: The second byte slice.
// - maxError: The maximum allowed error. If set to -1, no limit is applied.
// - endgapfree: A boolean flag indicating whether the LCS should be end-gap free.
// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
//
// Returns:
// - The score of the LCS.
// - The length of the LCS.
func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {
lA := len(bA)
@ -316,10 +338,48 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
return s, l
}
// FastLCSEGFScore calculates the score of the longest common subsequence between two bio sequences in end-gap-free mode.
//
// if maxError > 0, the maximum allowed error between the sequences is maxError.
// Otherwise, no error checking is done.
// If the actual number of errors is larger than maxError, -1 is returned for both values.
//
// The score matrix is:
// - Matching: 1
// - Mismatch or gap: 0
//
// Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps.
//
// if buffer != nil, the buffer is used to store intermediate results.
// Otherwise, a new buffer is allocated.
//
// seqA: The first bio sequence.
// seqB: The second bio sequence.
// maxError: The maximum allowed error between the sequences.
// buffer: A buffer to store intermediate results.
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
}
// FastLCSScore calculates the score of the longest common subsequence between two bio sequences.
//
// if maxError > 0, the maximum allowed error between the sequences is maxError.
// Otherwise, no error checking is done.
// If the actual number of errors is larger than maxError, -1 is returned for both values.
//
// The score matrix is:
// - Matching: 1
// - Mismatch or gap: 0
//
// if buffer != nil, the buffer is used to store intermediate results.
// Otherwise, a new buffer is allocated.
//
// seqA: The first bio sequence.
// seqB: The second bio sequence.
// maxError: The maximum allowed error between the sequences.
// buffer: A buffer to store intermediate results.
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
}

View File

@ -1,9 +1,9 @@
package obiformats
import (
"compress/gzip"
"encoding/csv"
"fmt"
gzip "github.com/klauspost/pgzip"
"io"
"os"
"path"

View File

@ -2,7 +2,7 @@ package obiformats
import (
"bufio"
"compress/gzip"
gzip "github.com/klauspost/pgzip"
"io"
"os"
"path"
@ -52,7 +52,7 @@ func ReadSequencesFromFile(filename string,
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
file, err = os.Open(filename)
if err != nil {

View File

@ -3,6 +3,7 @@ package obiiter
import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
@ -15,7 +16,7 @@ import (
// - First is allowing to indicates the number of workers running in parallele (default 4)
// - The second the size of the chanel buffer. By default set to the same value than the input buffer.
func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence {
nworkers := 4
nworkers := obioptions.CLIParallelWorkers()
if len(sizes) > 0 {
nworkers = sizes[0]

40
pkg/obiutils/array.go Normal file
View File

@ -0,0 +1,40 @@
package obiutils
// Matrix is a generic type representing a matrix.
type Matrix[T any] [][]T
// Make2DArray generates a 2D array of type T with the specified number of rows and columns.
//
// Data is stored in a contiguous memory block in row-major order.
//
// Parameters:
// - rows: the number of rows in the 2D array.
// - cols: the number of columns in the 2D array.
//
// Return:
// - Matrix[T]: the generated 2D array of type T.
func Make2DArray[T any](rows, cols int) Matrix[T] {
matrix := make(Matrix[T], rows)
data := make([]T, cols*rows)
for i := 0; i < rows; i++ {
matrix[i] = data[i*cols : (i+1)*cols]
}
return matrix
}
// Row returns the i-th row of the matrix.
//
// Parameters:
//
// i - the index of the row to retrieve.
//
// Return:
//
// []T - the i-th row of the matrix.
func (matrix *Matrix[T]) Column(i int) []T {
r := make([]T, len(*matrix))
for j := 0; j < len(*matrix); j++ {
r[j] = (*matrix)[j][i]
}
return r
}

View File

@ -2,7 +2,7 @@ package obiutils
import (
"bufio"
"compress/gzip"
gzip "github.com/klauspost/pgzip"
"io"
"os"
)

99
pkg/obiutils/set.go Normal file
View File

@ -0,0 +1,99 @@
package obiutils
import "fmt"
type Set[E comparable] map[E]struct{}
// MakeSet creates a new Set with the provided values.
//
// The function takes a variadic parameter `vals` of type `E` which represents the values
// that will be added to the Set.
//
// The function returns a Set of type `Set[E]` which contains the provided values.
func MakeSet[E comparable](vals ...E) Set[E] {
s := Set[E]{}
for _, v := range vals {
s[v] = struct{}{}
}
return s
}
// NewSet creates a new Set with the given values.
//
// It takes a variadic parameter of type E, where E is a comparable type.
// It returns a pointer to a Set of type E.
func NewSet[E comparable](vals ...E) *Set[E] {
s := MakeSet[E](vals...)
return &s
}
// Add adds the given values to the set.
//
// It takes a variadic parameter `vals` of type `E`.
// There is no return type for this function.
func (s Set[E]) Add(vals ...E) {
for _, v := range vals {
s[v] = struct{}{}
}
}
// Contains checks if the set contains a given element.
//
// Parameters:
// - v: the element to check for presence in the set.
//
// Returns:
// - bool: true if the set contains the given element, false otherwise.
func (s Set[E]) Contains(v E) bool {
_, ok := s[v]
return ok
}
// Members returns a slice of all the elements in the set.
//
// It does not modify the original set.
// It returns a slice of type []E.
func (s Set[E]) Members() []E {
result := make([]E, 0, len(s))
for v := range s {
result = append(result, v)
}
return result
}
// String returns a string representation of the set.
//
// It returns a string representation of the set by formatting the set's members using the fmt.Sprintf function.
// The resulting string is then returned.
func (s Set[E]) String() string {
return fmt.Sprintf("%v", s.Members())
}
// Union returns a new set that is the union of the current set and the specified set.
//
// Parameters:
// - s2: the set to be unioned with the current set.
//
// Return:
// - Set[E]: the resulting set after the union operation.
func (s Set[E]) Union(s2 Set[E]) Set[E] {
result := MakeSet(s.Members()...)
result.Add(s2.Members()...)
return result
}
// Intersection returns a new set that contains the common elements between the current set and another set.
//
// Parameter:
// - s2: the other set to compare with.
// Return:
// - Set[E]: a new set that contains the common elements.
func (s Set[E]) Intersection(s2 Set[E]) Set[E] {
result := MakeSet[E]()
for _, v := range s.Members() {
if s2.Contains(v) {
result.Add(v)
}
}
return result
}