mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Add some doc and switch to the parallel gzip library
Former-commit-id: 2c1187001f989ba3de5895f516d4c8b54d52a4c4
This commit is contained in:
@ -29,20 +29,39 @@ var _NucPartMatch [32][32]float64
|
|||||||
var _NucScorePartMatchMatch [100][100]int
|
var _NucScorePartMatchMatch [100][100]int
|
||||||
var _NucScorePartMatchMismatch [100][100]int
|
var _NucScorePartMatchMismatch [100][100]int
|
||||||
|
|
||||||
|
// _MatchRatio calculates the match ratio between two bytes.
|
||||||
|
//
|
||||||
|
// It takes two parameters, a and b, which are bytes to be compared.
|
||||||
|
// The function returns a float64 value representing the match ratio.
|
||||||
func _MatchRatio(a, b byte) float64 {
|
func _MatchRatio(a, b byte) float64 {
|
||||||
// count of common bits
|
// count of common bits
|
||||||
cm := _FourBitsCount[a&b&15]
|
cm := _FourBitsCount[a&b&15]
|
||||||
|
|
||||||
|
// count of bits in a
|
||||||
ca := _FourBitsCount[a&15]
|
ca := _FourBitsCount[a&15]
|
||||||
|
|
||||||
|
// count of bits in b
|
||||||
cb := _FourBitsCount[b&15]
|
cb := _FourBitsCount[b&15]
|
||||||
|
|
||||||
|
// check if any of the counts is zero
|
||||||
if cm == 0 || ca == 0 || cb == 0 {
|
if cm == 0 || ca == 0 || cb == 0 {
|
||||||
return float64(0)
|
return float64(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// calculate the match ratio
|
||||||
return float64(cm) / float64(ca) / float64(cb)
|
return float64(cm) / float64(ca) / float64(cb)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// _Logaddexp calculates the logarithm of the sum of exponentials of two given numbers.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
//
|
||||||
|
// a - the first number (float64)
|
||||||
|
// b - the second number (float64)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
//
|
||||||
|
// float64 - the result of the calculation
|
||||||
func _Logaddexp(a, b float64) float64 {
|
func _Logaddexp(a, b float64) float64 {
|
||||||
if a > b {
|
if a > b {
|
||||||
a, b = b, a
|
a, b = b, a
|
||||||
@ -51,6 +70,15 @@ func _Logaddexp(a, b float64) float64 {
|
|||||||
return b + math.Log1p(math.Exp(a-b))
|
return b + math.Log1p(math.Exp(a-b))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// _MatchScoreRatio calculates the match score ratio between two bytes.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - a: the first byte
|
||||||
|
// - b: the second byte
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - float64: the match score ratio when a match is observed
|
||||||
|
// - float64: the match score ratio when a mismatch is observed
|
||||||
func _MatchScoreRatio(a, b byte) (float64, float64) {
|
func _MatchScoreRatio(a, b byte) (float64, float64) {
|
||||||
|
|
||||||
l2 := math.Log(2)
|
l2 := math.Log(2)
|
||||||
|
@ -31,6 +31,28 @@ func _samenuc(a, b byte) bool {
|
|||||||
}
|
}
|
||||||
return a == b
|
return a == b
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
|
||||||
|
//
|
||||||
|
// The score is calculated using the following scoring matrix:
|
||||||
|
// - Match : +1
|
||||||
|
// - Mismatch and gap: 0
|
||||||
|
//
|
||||||
|
// The LCS is calculated using the Needleman-Wunsch algorithm.
|
||||||
|
// At the same time the length of the shortest path between the two sequences is calculated.
|
||||||
|
// If the endgapfree flag is set to true, the returned length does not include the end gaps.
|
||||||
|
// If the number of mismatches or gaps is larger than the maximum allowed error, -1 is returned for both.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - bA: The first byte slice.
|
||||||
|
// - bB: The second byte slice.
|
||||||
|
// - maxError: The maximum allowed error. If set to -1, no limit is applied.
|
||||||
|
// - endgapfree: A boolean flag indicating whether the LCS should be end-gap free.
|
||||||
|
// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - The score of the LCS.
|
||||||
|
// - The length of the LCS.
|
||||||
func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {
|
func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {
|
||||||
|
|
||||||
lA := len(bA)
|
lA := len(bA)
|
||||||
@ -316,10 +338,48 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
|
|||||||
return s, l
|
return s, l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FastLCSEGFScore calculates the score of the longest common subsequence between two bio sequences in end-gap-free mode.
|
||||||
|
//
|
||||||
|
// if maxError > 0, the maximum allowed error between the sequences is maxError.
|
||||||
|
// Otherwise, no error checking is done.
|
||||||
|
// If the actual number of errors is larger than maxError, -1 is returned for both values.
|
||||||
|
//
|
||||||
|
// The score matrix is:
|
||||||
|
// - Matching: 1
|
||||||
|
// - Mismatch or gap: 0
|
||||||
|
//
|
||||||
|
// Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps.
|
||||||
|
//
|
||||||
|
// if buffer != nil, the buffer is used to store intermediate results.
|
||||||
|
// Otherwise, a new buffer is allocated.
|
||||||
|
//
|
||||||
|
// seqA: The first bio sequence.
|
||||||
|
// seqB: The second bio sequence.
|
||||||
|
// maxError: The maximum allowed error between the sequences.
|
||||||
|
// buffer: A buffer to store intermediate results.
|
||||||
|
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
|
||||||
func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
||||||
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
|
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FastLCSScore calculates the score of the longest common subsequence between two bio sequences.
|
||||||
|
//
|
||||||
|
// if maxError > 0, the maximum allowed error between the sequences is maxError.
|
||||||
|
// Otherwise, no error checking is done.
|
||||||
|
// If the actual number of errors is larger than maxError, -1 is returned for both values.
|
||||||
|
//
|
||||||
|
// The score matrix is:
|
||||||
|
// - Matching: 1
|
||||||
|
// - Mismatch or gap: 0
|
||||||
|
//
|
||||||
|
// if buffer != nil, the buffer is used to store intermediate results.
|
||||||
|
// Otherwise, a new buffer is allocated.
|
||||||
|
//
|
||||||
|
// seqA: The first bio sequence.
|
||||||
|
// seqB: The second bio sequence.
|
||||||
|
// maxError: The maximum allowed error between the sequences.
|
||||||
|
// buffer: A buffer to store intermediate results.
|
||||||
|
// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding.
|
||||||
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
|
||||||
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
|
return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
|
||||||
}
|
}
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
package obiformats
|
package obiformats
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"compress/gzip"
|
|
||||||
"encoding/csv"
|
"encoding/csv"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
gzip "github.com/klauspost/pgzip"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
|
@ -2,7 +2,7 @@ package obiformats
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"compress/gzip"
|
gzip "github.com/klauspost/pgzip"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
@ -52,7 +52,7 @@ func ReadSequencesFromFile(filename string,
|
|||||||
var err error
|
var err error
|
||||||
|
|
||||||
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
file, err = os.Open(filename)
|
file, err = os.Open(filename)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -3,6 +3,7 @@ package obiiter
|
|||||||
import (
|
import (
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -15,7 +16,7 @@ import (
|
|||||||
// - First is allowing to indicates the number of workers running in parallele (default 4)
|
// - First is allowing to indicates the number of workers running in parallele (default 4)
|
||||||
// - The second the size of the chanel buffer. By default set to the same value than the input buffer.
|
// - The second the size of the chanel buffer. By default set to the same value than the input buffer.
|
||||||
func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence {
|
func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence {
|
||||||
nworkers := 4
|
nworkers := obioptions.CLIParallelWorkers()
|
||||||
|
|
||||||
if len(sizes) > 0 {
|
if len(sizes) > 0 {
|
||||||
nworkers = sizes[0]
|
nworkers = sizes[0]
|
||||||
|
40
pkg/obiutils/array.go
Normal file
40
pkg/obiutils/array.go
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
package obiutils
|
||||||
|
|
||||||
|
// Matrix is a generic type representing a matrix.
|
||||||
|
type Matrix[T any] [][]T
|
||||||
|
|
||||||
|
// Make2DArray generates a 2D array of type T with the specified number of rows and columns.
|
||||||
|
//
|
||||||
|
// Data is stored in a contiguous memory block in row-major order.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - rows: the number of rows in the 2D array.
|
||||||
|
// - cols: the number of columns in the 2D array.
|
||||||
|
//
|
||||||
|
// Return:
|
||||||
|
// - Matrix[T]: the generated 2D array of type T.
|
||||||
|
func Make2DArray[T any](rows, cols int) Matrix[T] {
|
||||||
|
matrix := make(Matrix[T], rows)
|
||||||
|
data := make([]T, cols*rows)
|
||||||
|
for i := 0; i < rows; i++ {
|
||||||
|
matrix[i] = data[i*cols : (i+1)*cols]
|
||||||
|
}
|
||||||
|
return matrix
|
||||||
|
}
|
||||||
|
|
||||||
|
// Row returns the i-th row of the matrix.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
//
|
||||||
|
// i - the index of the row to retrieve.
|
||||||
|
//
|
||||||
|
// Return:
|
||||||
|
//
|
||||||
|
// []T - the i-th row of the matrix.
|
||||||
|
func (matrix *Matrix[T]) Column(i int) []T {
|
||||||
|
r := make([]T, len(*matrix))
|
||||||
|
for j := 0; j < len(*matrix); j++ {
|
||||||
|
r[j] = (*matrix)[j][i]
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
@ -2,7 +2,7 @@ package obiutils
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"compress/gzip"
|
gzip "github.com/klauspost/pgzip"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
)
|
)
|
||||||
|
99
pkg/obiutils/set.go
Normal file
99
pkg/obiutils/set.go
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
package obiutils
|
||||||
|
|
||||||
|
import "fmt"
|
||||||
|
|
||||||
|
type Set[E comparable] map[E]struct{}
|
||||||
|
|
||||||
|
// MakeSet creates a new Set with the provided values.
|
||||||
|
//
|
||||||
|
// The function takes a variadic parameter `vals` of type `E` which represents the values
|
||||||
|
// that will be added to the Set.
|
||||||
|
//
|
||||||
|
// The function returns a Set of type `Set[E]` which contains the provided values.
|
||||||
|
func MakeSet[E comparable](vals ...E) Set[E] {
|
||||||
|
s := Set[E]{}
|
||||||
|
for _, v := range vals {
|
||||||
|
s[v] = struct{}{}
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewSet creates a new Set with the given values.
|
||||||
|
//
|
||||||
|
// It takes a variadic parameter of type E, where E is a comparable type.
|
||||||
|
// It returns a pointer to a Set of type E.
|
||||||
|
func NewSet[E comparable](vals ...E) *Set[E] {
|
||||||
|
s := MakeSet[E](vals...)
|
||||||
|
return &s
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add adds the given values to the set.
|
||||||
|
//
|
||||||
|
// It takes a variadic parameter `vals` of type `E`.
|
||||||
|
// There is no return type for this function.
|
||||||
|
func (s Set[E]) Add(vals ...E) {
|
||||||
|
for _, v := range vals {
|
||||||
|
s[v] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Contains checks if the set contains a given element.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - v: the element to check for presence in the set.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - bool: true if the set contains the given element, false otherwise.
|
||||||
|
func (s Set[E]) Contains(v E) bool {
|
||||||
|
_, ok := s[v]
|
||||||
|
return ok
|
||||||
|
}
|
||||||
|
|
||||||
|
// Members returns a slice of all the elements in the set.
|
||||||
|
//
|
||||||
|
// It does not modify the original set.
|
||||||
|
// It returns a slice of type []E.
|
||||||
|
func (s Set[E]) Members() []E {
|
||||||
|
result := make([]E, 0, len(s))
|
||||||
|
for v := range s {
|
||||||
|
result = append(result, v)
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// String returns a string representation of the set.
|
||||||
|
//
|
||||||
|
// It returns a string representation of the set by formatting the set's members using the fmt.Sprintf function.
|
||||||
|
// The resulting string is then returned.
|
||||||
|
func (s Set[E]) String() string {
|
||||||
|
return fmt.Sprintf("%v", s.Members())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Union returns a new set that is the union of the current set and the specified set.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - s2: the set to be unioned with the current set.
|
||||||
|
//
|
||||||
|
// Return:
|
||||||
|
// - Set[E]: the resulting set after the union operation.
|
||||||
|
func (s Set[E]) Union(s2 Set[E]) Set[E] {
|
||||||
|
result := MakeSet(s.Members()...)
|
||||||
|
result.Add(s2.Members()...)
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// Intersection returns a new set that contains the common elements between the current set and another set.
|
||||||
|
//
|
||||||
|
// Parameter:
|
||||||
|
// - s2: the other set to compare with.
|
||||||
|
// Return:
|
||||||
|
// - Set[E]: a new set that contains the common elements.
|
||||||
|
func (s Set[E]) Intersection(s2 Set[E]) Set[E] {
|
||||||
|
result := MakeSet[E]()
|
||||||
|
for _, v := range s.Members() {
|
||||||
|
if s2.Contains(v) {
|
||||||
|
result.Add(v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
Reference in New Issue
Block a user