From 2a11adb346ef579054955e8c3443c73d5ed66c04 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 25 Aug 2023 14:36:38 +0200 Subject: [PATCH] Add some doc and switch to the parallel gzip library Former-commit-id: 2c1187001f989ba3de5895f516d4c8b54d52a4c4 --- pkg/obialign/dnamatrix.go | 28 +++++++++ pkg/obialign/fastlcsegf.go | 60 +++++++++++++++++++ pkg/obiformats/ecopcr_read.go | 2 +- pkg/obiformats/universal_read.go | 4 +- pkg/obiiter/workers.go | 3 +- pkg/obiutils/array.go | 40 +++++++++++++ pkg/obiutils/gzipfile.go | 2 +- pkg/obiutils/set.go | 99 ++++++++++++++++++++++++++++++++ 8 files changed, 233 insertions(+), 5 deletions(-) create mode 100644 pkg/obiutils/array.go create mode 100644 pkg/obiutils/set.go diff --git a/pkg/obialign/dnamatrix.go b/pkg/obialign/dnamatrix.go index 6d8cac0..fdb982c 100644 --- a/pkg/obialign/dnamatrix.go +++ b/pkg/obialign/dnamatrix.go @@ -29,20 +29,39 @@ var _NucPartMatch [32][32]float64 var _NucScorePartMatchMatch [100][100]int var _NucScorePartMatchMismatch [100][100]int +// _MatchRatio calculates the match ratio between two bytes. +// +// It takes two parameters, a and b, which are bytes to be compared. +// The function returns a float64 value representing the match ratio. func _MatchRatio(a, b byte) float64 { // count of common bits cm := _FourBitsCount[a&b&15] + // count of bits in a ca := _FourBitsCount[a&15] + + // count of bits in b cb := _FourBitsCount[b&15] + // check if any of the counts is zero if cm == 0 || ca == 0 || cb == 0 { return float64(0) } + // calculate the match ratio return float64(cm) / float64(ca) / float64(cb) } +// _Logaddexp calculates the logarithm of the sum of exponentials of two given numbers. +// +// Parameters: +// +// a - the first number (float64) +// b - the second number (float64) +// +// Returns: +// +// float64 - the result of the calculation func _Logaddexp(a, b float64) float64 { if a > b { a, b = b, a @@ -51,6 +70,15 @@ func _Logaddexp(a, b float64) float64 { return b + math.Log1p(math.Exp(a-b)) } +// _MatchScoreRatio calculates the match score ratio between two bytes. +// +// Parameters: +// - a: the first byte +// - b: the second byte +// +// Returns: +// - float64: the match score ratio when a match is observed +// - float64: the match score ratio when a mismatch is observed func _MatchScoreRatio(a, b byte) (float64, float64) { l2 := math.Log(2) diff --git a/pkg/obialign/fastlcsegf.go b/pkg/obialign/fastlcsegf.go index 943a2f6..586e799 100644 --- a/pkg/obialign/fastlcsegf.go +++ b/pkg/obialign/fastlcsegf.go @@ -31,6 +31,28 @@ func _samenuc(a, b byte) bool { } return a == b } + +// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices. +// +// The score is calculated using the following scoring matrix: +// - Match : +1 +// - Mismatch and gap: 0 +// +// The LCS is calculated using the Needleman-Wunsch algorithm. +// At the same time the length of the shortest path between the two sequences is calculated. +// If the endgapfree flag is set to true, the returned length does not include the end gaps. +// If the number of mismatches or gaps is larger than the maximum allowed error, -1 is returned for both. +// +// Parameters: +// - bA: The first byte slice. +// - bB: The second byte slice. +// - maxError: The maximum allowed error. If set to -1, no limit is applied. +// - endgapfree: A boolean flag indicating whether the LCS should be end-gap free. +// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created. +// +// Returns: +// - The score of the LCS. +// - The length of the LCS. func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) { lA := len(bA) @@ -316,10 +338,48 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ return s, l } +// FastLCSEGFScore calculates the score of the longest common subsequence between two bio sequences in end-gap-free mode. +// +// if maxError > 0, the maximum allowed error between the sequences is maxError. +// Otherwise, no error checking is done. +// If the actual number of errors is larger than maxError, -1 is returned for both values. +// +// The score matrix is: +// - Matching: 1 +// - Mismatch or gap: 0 +// +// Compared to FastLCSScoreByte the length of the shortest alignment returned does not include the end-gaps. +// +// if buffer != nil, the buffer is used to store intermediate results. +// Otherwise, a new buffer is allocated. +// +// seqA: The first bio sequence. +// seqB: The second bio sequence. +// maxError: The maximum allowed error between the sequences. +// buffer: A buffer to store intermediate results. +// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding. func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer) } +// FastLCSScore calculates the score of the longest common subsequence between two bio sequences. +// +// if maxError > 0, the maximum allowed error between the sequences is maxError. +// Otherwise, no error checking is done. +// If the actual number of errors is larger than maxError, -1 is returned for both values. +// +// The score matrix is: +// - Matching: 1 +// - Mismatch or gap: 0 +// +// if buffer != nil, the buffer is used to store intermediate results. +// Otherwise, a new buffer is allocated. +// +// seqA: The first bio sequence. +// seqB: The second bio sequence. +// maxError: The maximum allowed error between the sequences. +// buffer: A buffer to store intermediate results. +// Returns the score of the longest common subsequence and the length of the shortest alignment corresponding. func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer) } diff --git a/pkg/obiformats/ecopcr_read.go b/pkg/obiformats/ecopcr_read.go index 96b68b7..1dae062 100644 --- a/pkg/obiformats/ecopcr_read.go +++ b/pkg/obiformats/ecopcr_read.go @@ -1,9 +1,9 @@ package obiformats import ( - "compress/gzip" "encoding/csv" "fmt" + gzip "github.com/klauspost/pgzip" "io" "os" "path" diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index 5143303..f091ecb 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -2,7 +2,7 @@ package obiformats import ( "bufio" - "compress/gzip" + gzip "github.com/klauspost/pgzip" "io" "os" "path" @@ -52,7 +52,7 @@ func ReadSequencesFromFile(filename string, var err error options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - + file, err = os.Open(filename) if err != nil { diff --git a/pkg/obiiter/workers.go b/pkg/obiiter/workers.go index b7adc6e..469895b 100644 --- a/pkg/obiiter/workers.go +++ b/pkg/obiiter/workers.go @@ -3,6 +3,7 @@ package obiiter import ( log "github.com/sirupsen/logrus" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) @@ -15,7 +16,7 @@ import ( // - First is allowing to indicates the number of workers running in parallele (default 4) // - The second the size of the chanel buffer. By default set to the same value than the input buffer. func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence { - nworkers := 4 + nworkers := obioptions.CLIParallelWorkers() if len(sizes) > 0 { nworkers = sizes[0] diff --git a/pkg/obiutils/array.go b/pkg/obiutils/array.go new file mode 100644 index 0000000..0fa4346 --- /dev/null +++ b/pkg/obiutils/array.go @@ -0,0 +1,40 @@ +package obiutils + +// Matrix is a generic type representing a matrix. +type Matrix[T any] [][]T + +// Make2DArray generates a 2D array of type T with the specified number of rows and columns. +// +// Data is stored in a contiguous memory block in row-major order. +// +// Parameters: +// - rows: the number of rows in the 2D array. +// - cols: the number of columns in the 2D array. +// +// Return: +// - Matrix[T]: the generated 2D array of type T. +func Make2DArray[T any](rows, cols int) Matrix[T] { + matrix := make(Matrix[T], rows) + data := make([]T, cols*rows) + for i := 0; i < rows; i++ { + matrix[i] = data[i*cols : (i+1)*cols] + } + return matrix +} + +// Row returns the i-th row of the matrix. +// +// Parameters: +// +// i - the index of the row to retrieve. +// +// Return: +// +// []T - the i-th row of the matrix. +func (matrix *Matrix[T]) Column(i int) []T { + r := make([]T, len(*matrix)) + for j := 0; j < len(*matrix); j++ { + r[j] = (*matrix)[j][i] + } + return r +} diff --git a/pkg/obiutils/gzipfile.go b/pkg/obiutils/gzipfile.go index b535b4c..ed2c74e 100644 --- a/pkg/obiutils/gzipfile.go +++ b/pkg/obiutils/gzipfile.go @@ -2,7 +2,7 @@ package obiutils import ( "bufio" - "compress/gzip" + gzip "github.com/klauspost/pgzip" "io" "os" ) diff --git a/pkg/obiutils/set.go b/pkg/obiutils/set.go new file mode 100644 index 0000000..4a9ef52 --- /dev/null +++ b/pkg/obiutils/set.go @@ -0,0 +1,99 @@ +package obiutils + +import "fmt" + +type Set[E comparable] map[E]struct{} + +// MakeSet creates a new Set with the provided values. +// +// The function takes a variadic parameter `vals` of type `E` which represents the values +// that will be added to the Set. +// +// The function returns a Set of type `Set[E]` which contains the provided values. +func MakeSet[E comparable](vals ...E) Set[E] { + s := Set[E]{} + for _, v := range vals { + s[v] = struct{}{} + } + return s +} + +// NewSet creates a new Set with the given values. +// +// It takes a variadic parameter of type E, where E is a comparable type. +// It returns a pointer to a Set of type E. +func NewSet[E comparable](vals ...E) *Set[E] { + s := MakeSet[E](vals...) + return &s +} + +// Add adds the given values to the set. +// +// It takes a variadic parameter `vals` of type `E`. +// There is no return type for this function. +func (s Set[E]) Add(vals ...E) { + for _, v := range vals { + s[v] = struct{}{} + } +} + +// Contains checks if the set contains a given element. +// +// Parameters: +// - v: the element to check for presence in the set. +// +// Returns: +// - bool: true if the set contains the given element, false otherwise. +func (s Set[E]) Contains(v E) bool { + _, ok := s[v] + return ok +} + +// Members returns a slice of all the elements in the set. +// +// It does not modify the original set. +// It returns a slice of type []E. +func (s Set[E]) Members() []E { + result := make([]E, 0, len(s)) + for v := range s { + result = append(result, v) + } + return result +} + +// String returns a string representation of the set. +// +// It returns a string representation of the set by formatting the set's members using the fmt.Sprintf function. +// The resulting string is then returned. +func (s Set[E]) String() string { + return fmt.Sprintf("%v", s.Members()) +} + +// Union returns a new set that is the union of the current set and the specified set. +// +// Parameters: +// - s2: the set to be unioned with the current set. +// +// Return: +// - Set[E]: the resulting set after the union operation. +func (s Set[E]) Union(s2 Set[E]) Set[E] { + result := MakeSet(s.Members()...) + result.Add(s2.Members()...) + return result +} + +// Intersection returns a new set that contains the common elements between the current set and another set. +// +// Parameter: +// - s2: the other set to compare with. +// Return: +// - Set[E]: a new set that contains the common elements. +func (s Set[E]) Intersection(s2 Set[E]) Set[E] { + result := MakeSet[E]() + for _, v := range s.Members() { + if s2.Contains(v) { + result.Add(v) + } + } + return result +}