Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup.

For KmerSet:
- Added JaccardDistance method to compute the Jaccard distance between two KmerSets
- Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets

For KmerSetGroup:
- Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix
- Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix

Also includes:
- New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices
- Updated version handling with bump-version target in Makefile
- Added tests for all new methods
This commit is contained in:
Eric Coissac
2026-02-05 17:38:47 +01:00
parent aa2e94dd6f
commit e3c41fc11b
9 changed files with 1328 additions and 28 deletions

View File

@@ -0,0 +1,272 @@
package obikmer
import (
"math"
"testing"
)
func TestJaccardDistanceIdentical(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(100)
ks1.AddKmerCode(200)
ks1.AddKmerCode(300)
ks2 := NewKmerSet(5)
ks2.AddKmerCode(100)
ks2.AddKmerCode(200)
ks2.AddKmerCode(300)
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
if distance != 0.0 {
t.Errorf("Expected distance 0.0 for identical sets, got %f", distance)
}
if similarity != 1.0 {
t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity)
}
}
func TestJaccardDistanceDisjoint(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(100)
ks1.AddKmerCode(200)
ks1.AddKmerCode(300)
ks2 := NewKmerSet(5)
ks2.AddKmerCode(400)
ks2.AddKmerCode(500)
ks2.AddKmerCode(600)
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
if distance != 1.0 {
t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance)
}
if similarity != 0.0 {
t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity)
}
}
func TestJaccardDistancePartialOverlap(t *testing.T) {
// Set 1: {1, 2, 3}
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks1.AddKmerCode(2)
ks1.AddKmerCode(3)
// Set 2: {2, 3, 4}
ks2 := NewKmerSet(5)
ks2.AddKmerCode(2)
ks2.AddKmerCode(3)
ks2.AddKmerCode(4)
// Intersection: {2, 3} -> cardinality = 2
// Union: {1, 2, 3, 4} -> cardinality = 4
// Similarity = 2/4 = 0.5
// Distance = 1 - 0.5 = 0.5
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
expectedDistance := 0.5
expectedSimilarity := 0.5
if math.Abs(distance-expectedDistance) > 1e-10 {
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
}
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
}
}
func TestJaccardDistanceOneSubsetOfOther(t *testing.T) {
// Set 1: {1, 2}
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks1.AddKmerCode(2)
// Set 2: {1, 2, 3, 4}
ks2 := NewKmerSet(5)
ks2.AddKmerCode(1)
ks2.AddKmerCode(2)
ks2.AddKmerCode(3)
ks2.AddKmerCode(4)
// Intersection: {1, 2} -> cardinality = 2
// Union: {1, 2, 3, 4} -> cardinality = 4
// Similarity = 2/4 = 0.5
// Distance = 1 - 0.5 = 0.5
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
expectedDistance := 0.5
expectedSimilarity := 0.5
if math.Abs(distance-expectedDistance) > 1e-10 {
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
}
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
}
}
func TestJaccardDistanceEmptySets(t *testing.T) {
ks1 := NewKmerSet(5)
ks2 := NewKmerSet(5)
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
// By convention, distance = 1.0 for empty sets
if distance != 1.0 {
t.Errorf("Expected distance 1.0 for empty sets, got %f", distance)
}
if similarity != 0.0 {
t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity)
}
}
func TestJaccardDistanceOneEmpty(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks1.AddKmerCode(2)
ks1.AddKmerCode(3)
ks2 := NewKmerSet(5)
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
// Intersection: {} -> cardinality = 0
// Union: {1, 2, 3} -> cardinality = 3
// Similarity = 0/3 = 0.0
// Distance = 1.0
if distance != 1.0 {
t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance)
}
if similarity != 0.0 {
t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity)
}
}
func TestJaccardDistanceDifferentK(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks2 := NewKmerSet(7)
ks2.AddKmerCode(1)
defer func() {
if r := recover(); r == nil {
t.Errorf("Expected panic when computing Jaccard distance with different k values")
}
}()
_ = ks1.JaccardDistance(ks2)
}
func TestJaccardDistanceSimilarityRelation(t *testing.T) {
// Test that distance + similarity = 1.0 for all cases
testCases := []struct {
name string
ks1 *KmerSet
ks2 *KmerSet
}{
{
name: "partial overlap",
ks1: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(1)
ks.AddKmerCode(2)
ks.AddKmerCode(3)
return ks
}(),
ks2: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(2)
ks.AddKmerCode(3)
ks.AddKmerCode(4)
ks.AddKmerCode(5)
return ks
}(),
},
{
name: "identical",
ks1: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(10)
ks.AddKmerCode(20)
return ks
}(),
ks2: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(10)
ks.AddKmerCode(20)
return ks
}(),
},
{
name: "disjoint",
ks1: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(1)
return ks
}(),
ks2: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(100)
return ks
}(),
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
distance := tc.ks1.JaccardDistance(tc.ks2)
similarity := tc.ks1.JaccardSimilarity(tc.ks2)
sum := distance + similarity
if math.Abs(sum-1.0) > 1e-10 {
t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f",
distance, similarity, sum)
}
})
}
}
func TestJaccardDistanceSymmetry(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks1.AddKmerCode(2)
ks1.AddKmerCode(3)
ks2 := NewKmerSet(5)
ks2.AddKmerCode(2)
ks2.AddKmerCode(3)
ks2.AddKmerCode(4)
distance1 := ks1.JaccardDistance(ks2)
distance2 := ks2.JaccardDistance(ks1)
similarity1 := ks1.JaccardSimilarity(ks2)
similarity2 := ks2.JaccardSimilarity(ks1)
if math.Abs(distance1-distance2) > 1e-10 {
t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2)
}
if math.Abs(similarity1-similarity2) > 1e-10 {
t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2)
}
}