Add some doc and switch to the parallel gzip library

Former-commit-id: 2c1187001f989ba3de5895f516d4c8b54d52a4c4
This commit is contained in:
2023-08-25 14:36:38 +02:00
parent 8a98210103
commit 2a11adb346
8 changed files with 233 additions and 5 deletions

View File

@ -29,20 +29,39 @@ var _NucPartMatch [32][32]float64
// _NucScorePartMatchMatch is a 100x100 table of integer scores.
// NOTE(review): presumably the precomputed score used when a match is
// observed, indexed by a pair of values in [0, 100) — confirm against the
// code that fills this table.
var _NucScorePartMatchMatch [100][100]int

// _NucScorePartMatchMismatch is a 100x100 table of integer scores.
// NOTE(review): presumably the counterpart of _NucScorePartMatchMatch used
// when a mismatch is observed — confirm against the code that fills it.
var _NucScorePartMatchMismatch [100][100]int
// _MatchRatio returns the match ratio of two bytes, using only their four
// low-order bits.
//
// Three values are looked up in _FourBitsCount: one for the bits shared by a
// and b (a&b), one for a alone and one for b alone. The ratio is the shared
// value divided by the product of the two individual values. When any of the
// three looked-up values is zero, the ratio is 0.
func _MatchRatio(a, b byte) float64 {
	// Lookups are done in the order shared, a, b; the mask 15 keeps the low nibble.
	shared := _FourBitsCount[a&b&15]
	inA := _FourBitsCount[a&15]
	inB := _FourBitsCount[b&15]

	// Only a fully non-zero triple yields a meaningful ratio.
	if shared != 0 && inA != 0 && inB != 0 {
		return float64(shared) / float64(inA) / float64(inB)
	}

	return float64(0)
}
// _Logaddexp calculates the logarithm of the sum of exponentials of two given numbers.
//
// Parameters:
//
// a - the first number (float64)
// b - the second number (float64)
//
// Returns:
//
// float64 - the result of the calculation
func _Logaddexp(a, b float64) float64 {
if a > b {
a, b = b, a
@ -51,6 +70,15 @@ func _Logaddexp(a, b float64) float64 {
return b + math.Log1p(math.Exp(a-b))
}
// _MatchScoreRatio calculates the match score ratio between two bytes.
//
// Parameters:
// - a: the first byte
// - b: the second byte
//
// Returns:
// - float64: the match score ratio when a match is observed
// - float64: the match score ratio when a mismatch is observed
func _MatchScoreRatio(a, b byte) (float64, float64) {
l2 := math.Log(2)

View File

@ -31,6 +31,28 @@ func _samenuc(a, b byte) bool {
}
return a == b
}
// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
//
// The score is calculated using the following scoring matrix:
// - Match : +1
// - Mismatch and gap: 0
//
// The LCS is calculated using the Needleman-Wunsch algorithm.
// At the same time the length of the shortest path between the two sequences is calculated.
// If the endgapfree flag is set to true, the returned length does not include the end gaps.
// If the number of mismatches or gaps is larger than the maximum allowed error, -1 is returned for both.
//
// Parameters:
// - bA: The first byte slice.
// - bB: The second byte slice.
// - maxError: The maximum allowed error. If set to -1, no limit is applied.
// - endgapfree: A boolean flag indicating whether the LCS should be end-gap free.
// - buffer: A pointer to a uint64 slice to store intermediate results. If nil, a new slice is created.
//
// Returns:
// - The score of the LCS.
// - The length of the shortest alignment corresponding to that score (end gaps excluded when endgapfree is true).
func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[]uint64) (int, int) {
lA := len(bA)
@ -316,10 +338,48 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
return s, l
}
// FastLCSEGFScore computes the longest-common-subsequence (LCS) score of two
// bio sequences in end-gap-free mode.
//
// It is a thin wrapper: the raw sequences returned by seqA.Sequence() and
// seqB.Sequence() are handed to FastLCSEGFScoreByte with the endgapfree flag
// set to true, so, unlike FastLCSScore, the returned alignment length does
// not count the end gaps. maxError and buffer are forwarded untouched; see
// FastLCSEGFScoreByte for the scoring scheme (match: +1, mismatch or gap: 0),
// the handling of the error limit (-1 is returned for both values when it is
// exceeded) and the use of the optional buffer.
//
// Parameters:
//   - seqA: the first bio sequence.
//   - seqB: the second bio sequence.
//   - maxError: the maximum allowed error between the sequences.
//   - buffer: a buffer to store intermediate results; may be nil.
//
// Returns the LCS score and the length of the corresponding shortest alignment.
func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
	const endGapFree = true

	rawA, rawB := seqA.Sequence(), seqB.Sequence()

	return FastLCSEGFScoreByte(rawA, rawB, maxError, endGapFree, buffer)
}
// FastLCSScore computes the longest-common-subsequence (LCS) score of two bio
// sequences.
//
// It is a thin wrapper: the raw sequences returned by seqA.Sequence() and
// seqB.Sequence() are handed to FastLCSEGFScoreByte with the endgapfree flag
// set to false. maxError and buffer are forwarded untouched; see
// FastLCSEGFScoreByte for the scoring scheme (match: +1, mismatch or gap: 0),
// the handling of the error limit (-1 is returned for both values when it is
// exceeded) and the use of the optional buffer.
//
// Parameters:
//   - seqA: the first bio sequence.
//   - seqB: the second bio sequence.
//   - maxError: the maximum allowed error between the sequences.
//   - buffer: a buffer to store intermediate results; may be nil.
//
// Returns the LCS score and the length of the corresponding shortest alignment.
func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) {
	const endGapFree = false

	rawA, rawB := seqA.Sequence(), seqB.Sequence()

	return FastLCSEGFScoreByte(rawA, rawB, maxError, endGapFree, buffer)
}