mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Add some doc and switch to the parallel gzip library
Former-commit-id: 2c1187001f989ba3de5895f516d4c8b54d52a4c4
This commit is contained in:
@ -29,20 +29,39 @@ var _NucPartMatch [32][32]float64
|
||||
var _NucScorePartMatchMatch [100][100]int
|
||||
var _NucScorePartMatchMismatch [100][100]int
|
||||
|
||||
// _MatchRatio calculates the match ratio between two bytes.
|
||||
//
|
||||
// It takes two parameters, a and b, which are bytes to be compared.
|
||||
// The function returns a float64 value representing the match ratio.
|
||||
func _MatchRatio(a, b byte) float64 {
|
||||
// count of common bits
|
||||
cm := _FourBitsCount[a&b&15]
|
||||
|
||||
// count of bits in a
|
||||
ca := _FourBitsCount[a&15]
|
||||
|
||||
// count of bits in b
|
||||
cb := _FourBitsCount[b&15]
|
||||
|
||||
// check if any of the counts is zero
|
||||
if cm == 0 || ca == 0 || cb == 0 {
|
||||
return float64(0)
|
||||
}
|
||||
|
||||
// calculate the match ratio
|
||||
return float64(cm) / float64(ca) / float64(cb)
|
||||
}
|
||||
|
||||
// _Logaddexp calculates the logarithm of the sum of exponentials of two given numbers.
|
||||
//
|
||||
// Parameters:
|
||||
//
|
||||
// a - the first number (float64)
|
||||
// b - the second number (float64)
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// float64 - the result of the calculation
|
||||
func _Logaddexp(a, b float64) float64 {
|
||||
if a > b {
|
||||
a, b = b, a
|
||||
@ -51,6 +70,15 @@ func _Logaddexp(a, b float64) float64 {
|
||||
return b + math.Log1p(math.Exp(a-b))
|
||||
}
|
||||
|
||||
// _MatchScoreRatio calculates the match score ratio between two bytes.
|
||||
//
|
||||
// Parameters:
|
||||
// - a: the first byte
|
||||
// - b: the second byte
|
||||
//
|
||||
// Returns:
|
||||
// - float64: the match score ratio when a match is observed
|
||||
// - float64: the match score ratio when a mismatch is observed
|
||||
func _MatchScoreRatio(a, b byte) (float64, float64) {
|
||||
|
||||
l2 := math.Log(2)
|
||||
|
@ -31,6 +31,28 @@ func _samenuc(a, b byte) bool {
|
||||
}
|
||||
return a == b
|
||||
}
|
||||
|
||||
// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
|
||||
//
|
||||
// The score is calculated using the following scoring matrix:
|
||||
// - Match : +1
|
||||
// - Mismatch and gap: 0
|
||||
//
|
||||
// The LCS is calculated using the Needleman-Wunsch algorithm.
|
||||
// At the same time the length of the shortest path between the two sequences is calculated.
|
||||
// If the endgapfree flag is set to true, the returned length does not include the end gaps.
|
||||
// If the number of mismatches or gaps is larger than the maximum allowed error, -1 is returned for both.
|
||||
//
|
||||
// Parameters:
|
||||
// - bA: The first byte slice.
|
||||
// - bB: The second byte slice.
|
||||
// - maxError: The maximum allowed error. If set to -1, no limit is applied.
|
||||
// - endgapfree: A boolean flag indicating whether the LCS should be end-gap free.
|
||||
// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
|
||||
//
|
||||
// Returns:
|
||||
// - The score of the LCS.
|
||||
// - The length of the LCS.
|
||||
func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {
|
||||
|
||||
lA := len(bA)
|
||||
@ -316,10 +338,48 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
|
||||
return s, l
|
||||
}
|
||||
|
||||
// FastLCSEGFScore calculates the score of the longest common subsequence between two bio sequences in end-gap-free mode.
|
||||
//
|
||||
// if maxError > 0, the maximum allowed error between the sequences is maxError.
|
||||
// Otherwise, no error checking is done.
|
||||
// If the actual number of errors is larger than maxError, -1 is returned for both values.
|
||||
//
|
||||
// The score matrix is:
|
||||
// - Matching: 1
|
||||
// - Mismatch or gap: 0
|
||||
//
|
||||
// Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps.
|
||||
//
|
||||
// if buffer != nil, the buffer is used to store intermediate results.
|
||||
// Otherwise, a new buffer is allocated.
|
||||
//
|
||||
// seqA: The first bio sequence.
|
||||
// seqB: The second bio sequence.
|
||||
// maxError: The maximum allowed error between the sequences.
|
||||
// buffer: A buffer to store intermediate results.
|
||||
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
|
||||
func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
||||
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
|
||||
}
|
||||
|
||||
// FastLCSScore calculates the score of the longest common subsequence between two bio sequences.
|
||||
//
|
||||
// if maxError > 0, the maximum allowed error between the sequences is maxError.
|
||||
// Otherwise, no error checking is done.
|
||||
// If the actual number of errors is larger than maxError, -1 is returned for both values.
|
||||
//
|
||||
// The score matrix is:
|
||||
// - Matching: 1
|
||||
// - Mismatch or gap: 0
|
||||
//
|
||||
// if buffer != nil, the buffer is used to store intermediate results.
|
||||
// Otherwise, a new buffer is allocated.
|
||||
//
|
||||
// seqA: The first bio sequence.
|
||||
// seqB: The second bio sequence.
|
||||
// maxError: The maximum allowed error between the sequences.
|
||||
// buffer: A buffer to store intermediate results.
|
||||
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
|
||||
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
||||
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"compress/gzip"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
gzip "github.com/klauspost/pgzip"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
|
@ -2,7 +2,7 @@ package obiformats
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"compress/gzip"
|
||||
gzip "github.com/klauspost/pgzip"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
@ -52,7 +52,7 @@ func ReadSequencesFromFile(filename string,
|
||||
var err error
|
||||
|
||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||
|
||||
|
||||
file, err = os.Open(filename)
|
||||
|
||||
if err != nil {
|
||||
|
@ -3,6 +3,7 @@ package obiiter
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
@ -15,7 +16,7 @@ import (
|
||||
// - First is allowing to indicates the number of workers running in parallele (default 4)
|
||||
// - The second the size of the chanel buffer. By default set to the same value than the input buffer.
|
||||
func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence {
|
||||
nworkers := 4
|
||||
nworkers := obioptions.CLIParallelWorkers()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
|
40
pkg/obiutils/array.go
Normal file
40
pkg/obiutils/array.go
Normal file
@ -0,0 +1,40 @@
|
||||
package obiutils
|
||||
|
||||
// Matrix is a generic type representing a matrix.
|
||||
type Matrix[T any] [][]T
|
||||
|
||||
// Make2DArray generates a 2D array of type T with the specified number of rows and columns.
|
||||
//
|
||||
// Data is stored in a contiguous memory block in row-major order.
|
||||
//
|
||||
// Parameters:
|
||||
// - rows: the number of rows in the 2D array.
|
||||
// - cols: the number of columns in the 2D array.
|
||||
//
|
||||
// Return:
|
||||
// - Matrix[T]: the generated 2D array of type T.
|
||||
func Make2DArray[T any](rows, cols int) Matrix[T] {
|
||||
matrix := make(Matrix[T], rows)
|
||||
data := make([]T, cols*rows)
|
||||
for i := 0; i < rows; i++ {
|
||||
matrix[i] = data[i*cols : (i+1)*cols]
|
||||
}
|
||||
return matrix
|
||||
}
|
||||
|
||||
// Row returns the i-th row of the matrix.
|
||||
//
|
||||
// Parameters:
|
||||
//
|
||||
// i - the index of the row to retrieve.
|
||||
//
|
||||
// Return:
|
||||
//
|
||||
// []T - the i-th row of the matrix.
|
||||
func (matrix *Matrix[T]) Column(i int) []T {
|
||||
r := make([]T, len(*matrix))
|
||||
for j := 0; j < len(*matrix); j++ {
|
||||
r[j] = (*matrix)[j][i]
|
||||
}
|
||||
return r
|
||||
}
|
@ -2,7 +2,7 @@ package obiutils
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"compress/gzip"
|
||||
gzip "github.com/klauspost/pgzip"
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
99
pkg/obiutils/set.go
Normal file
99
pkg/obiutils/set.go
Normal file
@ -0,0 +1,99 @@
|
||||
package obiutils
|
||||
|
||||
import "fmt"
|
||||
|
||||
type Set[E comparable] map[E]struct{}
|
||||
|
||||
// MakeSet creates a new Set with the provided values.
|
||||
//
|
||||
// The function takes a variadic parameter `vals` of type `E` which represents the values
|
||||
// that will be added to the Set.
|
||||
//
|
||||
// The function returns a Set of type `Set[E]` which contains the provided values.
|
||||
func MakeSet[E comparable](vals ...E) Set[E] {
|
||||
s := Set[E]{}
|
||||
for _, v := range vals {
|
||||
s[v] = struct{}{}
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// NewSet creates a new Set with the given values.
|
||||
//
|
||||
// It takes a variadic parameter of type E, where E is a comparable type.
|
||||
// It returns a pointer to a Set of type E.
|
||||
func NewSet[E comparable](vals ...E) *Set[E] {
|
||||
s := MakeSet[E](vals...)
|
||||
return &s
|
||||
}
|
||||
|
||||
// Add adds the given values to the set.
|
||||
//
|
||||
// It takes a variadic parameter `vals` of type `E`.
|
||||
// There is no return type for this function.
|
||||
func (s Set[E]) Add(vals ...E) {
|
||||
for _, v := range vals {
|
||||
s[v] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
// Contains checks if the set contains a given element.
|
||||
//
|
||||
// Parameters:
|
||||
// - v: the element to check for presence in the set.
|
||||
//
|
||||
// Returns:
|
||||
// - bool: true if the set contains the given element, false otherwise.
|
||||
func (s Set[E]) Contains(v E) bool {
|
||||
_, ok := s[v]
|
||||
return ok
|
||||
}
|
||||
|
||||
// Members returns a slice of all the elements in the set.
|
||||
//
|
||||
// It does not modify the original set.
|
||||
// It returns a slice of type []E.
|
||||
func (s Set[E]) Members() []E {
|
||||
result := make([]E, 0, len(s))
|
||||
for v := range s {
|
||||
result = append(result, v)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// String returns a string representation of the set.
|
||||
//
|
||||
// It returns a string representation of the set by formatting the set's members using the fmt.Sprintf function.
|
||||
// The resulting string is then returned.
|
||||
func (s Set[E]) String() string {
|
||||
return fmt.Sprintf("%v", s.Members())
|
||||
}
|
||||
|
||||
// Union returns a new set that is the union of the current set and the specified set.
|
||||
//
|
||||
// Parameters:
|
||||
// - s2: the set to be unioned with the current set.
|
||||
//
|
||||
// Return:
|
||||
// - Set[E]: the resulting set after the union operation.
|
||||
func (s Set[E]) Union(s2 Set[E]) Set[E] {
|
||||
result := MakeSet(s.Members()...)
|
||||
result.Add(s2.Members()...)
|
||||
return result
|
||||
}
|
||||
|
||||
// Intersection returns a new set that contains the common elements between the current set and another set.
|
||||
//
|
||||
// Parameter:
|
||||
// - s2: the other set to compare with.
|
||||
// Return:
|
||||
// - Set[E]: a new set that contains the common elements.
|
||||
func (s Set[E]) Intersection(s2 Set[E]) Set[E] {
|
||||
result := MakeSet[E]()
|
||||
for _, v := range s.Members() {
|
||||
if s2.Contains(v) {
|
||||
result.Add(v)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
Reference in New Issue
Block a user