mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup. For KmerSet: - Added JaccardDistance method to compute the Jaccard distance between two KmerSets - Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets For KmerSetGroup: - Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix - Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix Also includes: - New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices - Updated version handling with bump-version target in Makefile - Added tests for all new methods
273 lines
5.9 KiB
Go
273 lines
5.9 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"math"
|
|
"testing"
|
|
)
|
|
|
|
func TestJaccardDistanceIdentical(t *testing.T) {
|
|
ks1 := NewKmerSet(5)
|
|
ks1.AddKmerCode(100)
|
|
ks1.AddKmerCode(200)
|
|
ks1.AddKmerCode(300)
|
|
|
|
ks2 := NewKmerSet(5)
|
|
ks2.AddKmerCode(100)
|
|
ks2.AddKmerCode(200)
|
|
ks2.AddKmerCode(300)
|
|
|
|
distance := ks1.JaccardDistance(ks2)
|
|
similarity := ks1.JaccardSimilarity(ks2)
|
|
|
|
if distance != 0.0 {
|
|
t.Errorf("Expected distance 0.0 for identical sets, got %f", distance)
|
|
}
|
|
|
|
if similarity != 1.0 {
|
|
t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity)
|
|
}
|
|
}
|
|
|
|
func TestJaccardDistanceDisjoint(t *testing.T) {
|
|
ks1 := NewKmerSet(5)
|
|
ks1.AddKmerCode(100)
|
|
ks1.AddKmerCode(200)
|
|
ks1.AddKmerCode(300)
|
|
|
|
ks2 := NewKmerSet(5)
|
|
ks2.AddKmerCode(400)
|
|
ks2.AddKmerCode(500)
|
|
ks2.AddKmerCode(600)
|
|
|
|
distance := ks1.JaccardDistance(ks2)
|
|
similarity := ks1.JaccardSimilarity(ks2)
|
|
|
|
if distance != 1.0 {
|
|
t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance)
|
|
}
|
|
|
|
if similarity != 0.0 {
|
|
t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity)
|
|
}
|
|
}
|
|
|
|
func TestJaccardDistancePartialOverlap(t *testing.T) {
|
|
// Set 1: {1, 2, 3}
|
|
ks1 := NewKmerSet(5)
|
|
ks1.AddKmerCode(1)
|
|
ks1.AddKmerCode(2)
|
|
ks1.AddKmerCode(3)
|
|
|
|
// Set 2: {2, 3, 4}
|
|
ks2 := NewKmerSet(5)
|
|
ks2.AddKmerCode(2)
|
|
ks2.AddKmerCode(3)
|
|
ks2.AddKmerCode(4)
|
|
|
|
// Intersection: {2, 3} -> cardinality = 2
|
|
// Union: {1, 2, 3, 4} -> cardinality = 4
|
|
// Similarity = 2/4 = 0.5
|
|
// Distance = 1 - 0.5 = 0.5
|
|
|
|
distance := ks1.JaccardDistance(ks2)
|
|
similarity := ks1.JaccardSimilarity(ks2)
|
|
|
|
expectedDistance := 0.5
|
|
expectedSimilarity := 0.5
|
|
|
|
if math.Abs(distance-expectedDistance) > 1e-10 {
|
|
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
|
|
}
|
|
|
|
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
|
|
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
|
|
}
|
|
}
|
|
|
|
func TestJaccardDistanceOneSubsetOfOther(t *testing.T) {
|
|
// Set 1: {1, 2}
|
|
ks1 := NewKmerSet(5)
|
|
ks1.AddKmerCode(1)
|
|
ks1.AddKmerCode(2)
|
|
|
|
// Set 2: {1, 2, 3, 4}
|
|
ks2 := NewKmerSet(5)
|
|
ks2.AddKmerCode(1)
|
|
ks2.AddKmerCode(2)
|
|
ks2.AddKmerCode(3)
|
|
ks2.AddKmerCode(4)
|
|
|
|
// Intersection: {1, 2} -> cardinality = 2
|
|
// Union: {1, 2, 3, 4} -> cardinality = 4
|
|
// Similarity = 2/4 = 0.5
|
|
// Distance = 1 - 0.5 = 0.5
|
|
|
|
distance := ks1.JaccardDistance(ks2)
|
|
similarity := ks1.JaccardSimilarity(ks2)
|
|
|
|
expectedDistance := 0.5
|
|
expectedSimilarity := 0.5
|
|
|
|
if math.Abs(distance-expectedDistance) > 1e-10 {
|
|
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
|
|
}
|
|
|
|
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
|
|
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
|
|
}
|
|
}
|
|
|
|
func TestJaccardDistanceEmptySets(t *testing.T) {
|
|
ks1 := NewKmerSet(5)
|
|
ks2 := NewKmerSet(5)
|
|
|
|
distance := ks1.JaccardDistance(ks2)
|
|
similarity := ks1.JaccardSimilarity(ks2)
|
|
|
|
// By convention, distance = 1.0 for empty sets
|
|
if distance != 1.0 {
|
|
t.Errorf("Expected distance 1.0 for empty sets, got %f", distance)
|
|
}
|
|
|
|
if similarity != 0.0 {
|
|
t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity)
|
|
}
|
|
}
|
|
|
|
func TestJaccardDistanceOneEmpty(t *testing.T) {
|
|
ks1 := NewKmerSet(5)
|
|
ks1.AddKmerCode(1)
|
|
ks1.AddKmerCode(2)
|
|
ks1.AddKmerCode(3)
|
|
|
|
ks2 := NewKmerSet(5)
|
|
|
|
distance := ks1.JaccardDistance(ks2)
|
|
similarity := ks1.JaccardSimilarity(ks2)
|
|
|
|
// Intersection: {} -> cardinality = 0
|
|
// Union: {1, 2, 3} -> cardinality = 3
|
|
// Similarity = 0/3 = 0.0
|
|
// Distance = 1.0
|
|
|
|
if distance != 1.0 {
|
|
t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance)
|
|
}
|
|
|
|
if similarity != 0.0 {
|
|
t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity)
|
|
}
|
|
}
|
|
|
|
func TestJaccardDistanceDifferentK(t *testing.T) {
|
|
ks1 := NewKmerSet(5)
|
|
ks1.AddKmerCode(1)
|
|
|
|
ks2 := NewKmerSet(7)
|
|
ks2.AddKmerCode(1)
|
|
|
|
defer func() {
|
|
if r := recover(); r == nil {
|
|
t.Errorf("Expected panic when computing Jaccard distance with different k values")
|
|
}
|
|
}()
|
|
|
|
_ = ks1.JaccardDistance(ks2)
|
|
}
|
|
|
|
func TestJaccardDistanceSimilarityRelation(t *testing.T) {
|
|
// Test that distance + similarity = 1.0 for all cases
|
|
testCases := []struct {
|
|
name string
|
|
ks1 *KmerSet
|
|
ks2 *KmerSet
|
|
}{
|
|
{
|
|
name: "partial overlap",
|
|
ks1: func() *KmerSet {
|
|
ks := NewKmerSet(5)
|
|
ks.AddKmerCode(1)
|
|
ks.AddKmerCode(2)
|
|
ks.AddKmerCode(3)
|
|
return ks
|
|
}(),
|
|
ks2: func() *KmerSet {
|
|
ks := NewKmerSet(5)
|
|
ks.AddKmerCode(2)
|
|
ks.AddKmerCode(3)
|
|
ks.AddKmerCode(4)
|
|
ks.AddKmerCode(5)
|
|
return ks
|
|
}(),
|
|
},
|
|
{
|
|
name: "identical",
|
|
ks1: func() *KmerSet {
|
|
ks := NewKmerSet(5)
|
|
ks.AddKmerCode(10)
|
|
ks.AddKmerCode(20)
|
|
return ks
|
|
}(),
|
|
ks2: func() *KmerSet {
|
|
ks := NewKmerSet(5)
|
|
ks.AddKmerCode(10)
|
|
ks.AddKmerCode(20)
|
|
return ks
|
|
}(),
|
|
},
|
|
{
|
|
name: "disjoint",
|
|
ks1: func() *KmerSet {
|
|
ks := NewKmerSet(5)
|
|
ks.AddKmerCode(1)
|
|
return ks
|
|
}(),
|
|
ks2: func() *KmerSet {
|
|
ks := NewKmerSet(5)
|
|
ks.AddKmerCode(100)
|
|
return ks
|
|
}(),
|
|
},
|
|
}
|
|
|
|
for _, tc := range testCases {
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
distance := tc.ks1.JaccardDistance(tc.ks2)
|
|
similarity := tc.ks1.JaccardSimilarity(tc.ks2)
|
|
|
|
sum := distance + similarity
|
|
|
|
if math.Abs(sum-1.0) > 1e-10 {
|
|
t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f",
|
|
distance, similarity, sum)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestJaccardDistanceSymmetry(t *testing.T) {
|
|
ks1 := NewKmerSet(5)
|
|
ks1.AddKmerCode(1)
|
|
ks1.AddKmerCode(2)
|
|
ks1.AddKmerCode(3)
|
|
|
|
ks2 := NewKmerSet(5)
|
|
ks2.AddKmerCode(2)
|
|
ks2.AddKmerCode(3)
|
|
ks2.AddKmerCode(4)
|
|
|
|
distance1 := ks1.JaccardDistance(ks2)
|
|
distance2 := ks2.JaccardDistance(ks1)
|
|
|
|
similarity1 := ks1.JaccardSimilarity(ks2)
|
|
similarity2 := ks2.JaccardSimilarity(ks1)
|
|
|
|
if math.Abs(distance1-distance2) > 1e-10 {
|
|
t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2)
|
|
}
|
|
|
|
if math.Abs(similarity1-similarity2) > 1e-10 {
|
|
t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2)
|
|
}
|
|
}
|