Add some doc and switch to the parallel gzip library

Former-commit-id: 2c1187001f989ba3de5895f516d4c8b54d52a4c4
2025-06-29 16:20:46 +00:00 · 2023-08-25 14:36:38 +02:00
parent 8a98210103
commit 2a11adb346
8 changed files with 233 additions and 5 deletions
--- a/pkg/obialign/dnamatrix.go
+++ b/pkg/obialign/dnamatrix.go
@@ -29,20 +29,39 @@ var _NucPartMatch [32][32]float64
 var _NucScorePartMatchMatch [100][100]int
 var _NucScorePartMatchMismatch [100][100]int

+// _MatchRatio calculates the match ratio between two bytes.
+//
+// Only the four low-order bits of a and b are used. With cm, ca and cb the _FourBitsCount values of a&b, a and b,
+// the function returns cm / (ca * cb) as a float64, or 0 if any of the three counts is zero.
 func _MatchRatio(a, b byte) float64 {
 	// count of common bits
 	cm := _FourBitsCount[a&b&15]

+	// count of bits in a
 	ca := _FourBitsCount[a&15]
+
+	// count of bits in b
 	cb := _FourBitsCount[b&15]

+	// check if any of the counts is zero
 	if cm == 0 || ca == 0 || cb == 0 {
 		return float64(0)
 	}

+	// calculate the match ratio
 	return float64(cm) / float64(ca) / float64(cb)
 }

+// _Logaddexp calculates the logarithm of the sum of exponentials of two given numbers.
+//
+// Parameters:
+//
+//	a - the first number (float64)
+//	b - the second number (float64)
+//
+// Returns:
+//
+//	float64 - the result of the calculation
 func _Logaddexp(a, b float64) float64 {
 	if a > b {
 		a, b = b, a
@@ -51,6 +70,15 @@ func _Logaddexp(a, b float64) float64 {
 	return b + math.Log1p(math.Exp(a-b))
 }

+// _MatchScoreRatio calculates the match score ratio between two bytes.
+//
+// Parameters:
+// - a: the first byte
+// - b: the second byte
+//
+// Returns:
+// - float64: the match score ratio when a match is observed
+// - float64: the match score ratio when a mismatch is observed
 func _MatchScoreRatio(a, b byte) (float64, float64) {

 	l2 := math.Log(2)
--- a/pkg/obialign/fastlcsegf.go
+++ b/pkg/obialign/fastlcsegf.go
@@ -31,6 +31,28 @@ func _samenuc(a, b byte) bool {
 	}
 	return a == b
 }
+
+// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
+//
+// The score is calculated using the following scoring matrix:
+//   - Match : +1
+//   - Mismatch and gap: 0
+//
+// The LCS is calculated using the Needleman-Wunsch algorithm.
+// At the same time the length of the shortest path between the two sequences is calculated.
+// If the endgapfree flag is set to true, the returned length does not include the end gaps.
+// If the number of mismatches or gaps is larger than the maximum allowed error, -1 is returned for both.
+//
+// Parameters:
+// - bA: The first byte slice.
+// - bB: The second byte slice.
+// - maxError: The maximum allowed error. If set to -1, no limit is applied.
+// - endgapfree: A boolean flag indicating whether the LCS should be end-gap free.
+// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
+//
+// Returns:
+// - The score of the LCS.
+// - The length of the shortest alignment path between the two sequences (end gaps excluded when endgapfree is true).
 func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {

 	lA := len(bA)
@@ -316,10 +338,48 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
 	return s, l
 }

+// FastLCSEGFScore calculates the score of the longest common subsequence between two bio sequences in end-gap-free mode.
+//
+// If maxError > 0, the maximum allowed error between the sequences is maxError.
+// Otherwise, no error checking is done.
+// If the actual number of errors is larger than maxError, -1 is returned for both values.
+//
+// The score matrix is:
+//   - Matching: 1
+//   - Mismatch or gap: 0
+//
+// Compared to FastLCSScore, the length of the shortest alignment returned does not include the end gaps.
+//
+// If buffer != nil, the buffer is used to store intermediate results.
+// Otherwise, a new buffer is allocated.
+//
+// seqA: The first bio sequence.
+// seqB: The second bio sequence.
+// maxError: The maximum allowed error between the sequences.
+// buffer: A buffer to store intermediate results.
+// Returns the score of the longest common subsequence and the length of the corresponding shortest alignment.
 func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
 	return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, true, buffer)
 }

+// FastLCSScore calculates the score of the longest common subsequence between two bio sequences.
+//
+// If maxError > 0, the maximum allowed error between the sequences is maxError.
+// Otherwise, no error checking is done.
+// If the actual number of errors is larger than maxError, -1 is returned for both values.
+//
+// The score matrix is:
+//   - Matching: 1
+//   - Mismatch or gap: 0
+//
+// If buffer != nil, the buffer is used to store intermediate results.
+// Otherwise, a new buffer is allocated.
+//
+// seqA: The first bio sequence.
+// seqB: The second bio sequence.
+// maxError: The maximum allowed error between the sequences.
+// buffer: A buffer to store intermediate results.
+// Returns the score of the longest common subsequence and the length of the corresponding shortest alignment.
 func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
 	return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer)
 }