mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup
Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup.

This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup.

For KmerSet:
- Added JaccardDistance method to compute the Jaccard distance between two KmerSets
- Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets

For KmerSetGroup:
- Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix
- Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix

Also includes:
- New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices
- Updated version handling with bump-version target in Makefile
- Added tests for all new methods
This commit is contained in:
@@ -158,6 +158,54 @@ func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
|
||||
return NewKmerSetFromBitmap(ks.k, result)
|
||||
}
|
||||
|
||||
// JaccardDistance computes the Jaccard distance between two KmerSets.
|
||||
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|)
|
||||
// where A and B are the two sets.
|
||||
//
|
||||
// Returns:
|
||||
// - 0.0 when sets are identical (distance = 0, similarity = 1)
|
||||
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
|
||||
// - 1.0 when both sets are empty (by convention)
|
||||
//
|
||||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
||||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
||||
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
|
||||
if ks.k != other.k {
|
||||
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
|
||||
}
|
||||
|
||||
// Compute intersection cardinality
|
||||
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
|
||||
|
||||
// Compute union cardinality
|
||||
unionCard := ks.bitmap.OrCardinality(other.bitmap)
|
||||
|
||||
// If union is empty, both sets are empty - return 1.0 by convention
|
||||
if unionCard == 0 {
|
||||
return 1.0
|
||||
}
|
||||
|
||||
// Jaccard similarity = |A ∩ B| / |A ∪ B|
|
||||
similarity := float64(intersectionCard) / float64(unionCard)
|
||||
|
||||
// Jaccard distance = 1 - similarity
|
||||
return 1.0 - similarity
|
||||
}
|
||||
|
||||
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
|
||||
// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B|
|
||||
//
|
||||
// Returns:
|
||||
// - 1.0 when sets are identical (maximum similarity)
|
||||
// - 0.0 when sets are completely disjoint (no similarity)
|
||||
// - 0.0 when both sets are empty (by convention)
|
||||
//
|
||||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
||||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
||||
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
|
||||
return 1.0 - ks.JaccardDistance(other)
|
||||
}
|
||||
|
||||
// Iterator returns an iterator over all k-mers in the set
|
||||
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
|
||||
return ks.bitmap.Iterator()
|
||||
|
||||
@@ -3,6 +3,7 @@ package obikmer
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
@@ -260,3 +261,79 @@ Set breakdown:
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group.
|
||||
// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance
|
||||
// between set i and set j.
|
||||
//
|
||||
// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|)
|
||||
//
|
||||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
||||
// otherwise they are set to "set_0", "set_1", etc.
|
||||
//
|
||||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
||||
// Space complexity: O(n²) for the distance matrix
|
||||
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Create labels from set IDs
|
||||
labels := make([]string, n)
|
||||
for i, ks := range ksg.sets {
|
||||
if ks.Id() != "" {
|
||||
labels[i] = ks.Id()
|
||||
} else {
|
||||
labels[i] = fmt.Sprintf("set_%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
dm := obidist.NewDistMatrixWithLabels(labels)
|
||||
|
||||
// Compute pairwise distances
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
distance := ksg.sets[i].JaccardDistance(ksg.sets[j])
|
||||
dm.Set(i, j, distance)
|
||||
}
|
||||
}
|
||||
|
||||
return dm
|
||||
}
|
||||
|
||||
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group.
|
||||
// Returns a similarity matrix where element (i, j) represents the Jaccard similarity
|
||||
// between set i and set j.
|
||||
//
|
||||
// The Jaccard similarity is: |A ∩ B| / |A ∪ B|
|
||||
//
|
||||
// The diagonal is 1.0 (similarity of a set to itself).
|
||||
//
|
||||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
||||
// otherwise they are set to "set_0", "set_1", etc.
|
||||
//
|
||||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
||||
// Space complexity: O(n²) for the similarity matrix
|
||||
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Create labels from set IDs
|
||||
labels := make([]string, n)
|
||||
for i, ks := range ksg.sets {
|
||||
if ks.Id() != "" {
|
||||
labels[i] = ks.Id()
|
||||
} else {
|
||||
labels[i] = fmt.Sprintf("set_%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
sm := obidist.NewSimilarityMatrixWithLabels(labels)
|
||||
|
||||
// Compute pairwise similarities
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j])
|
||||
sm.Set(i, j, similarity)
|
||||
}
|
||||
}
|
||||
|
||||
return sm
|
||||
}
|
||||
|
||||
231
pkg/obikmer/kmer_set_group_jaccard_test.go
Normal file
231
pkg/obikmer/kmer_set_group_jaccard_test.go
Normal file
@@ -0,0 +1,231 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Set 0: {1, 2, 3}
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
ksg.Get(0).AddKmerCode(3)
|
||||
ksg.Get(0).SetId("set_A")
|
||||
|
||||
// Set 1: {2, 3, 4}
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
ksg.Get(1).AddKmerCode(4)
|
||||
ksg.Get(1).SetId("set_B")
|
||||
|
||||
// Set 2: {5, 6, 7}
|
||||
ksg.Get(2).AddKmerCode(5)
|
||||
ksg.Get(2).AddKmerCode(6)
|
||||
ksg.Get(2).AddKmerCode(7)
|
||||
ksg.Get(2).SetId("set_C")
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
// Check labels
|
||||
if dm.GetLabel(0) != "set_A" {
|
||||
t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0))
|
||||
}
|
||||
if dm.GetLabel(1) != "set_B" {
|
||||
t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1))
|
||||
}
|
||||
if dm.GetLabel(2) != "set_C" {
|
||||
t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2))
|
||||
}
|
||||
|
||||
// Check distances
|
||||
// Distance(0, 1):
|
||||
// Intersection: {2, 3} -> 2 elements
|
||||
// Union: {1, 2, 3, 4} -> 4 elements
|
||||
// Similarity: 2/4 = 0.5
|
||||
// Distance: 1 - 0.5 = 0.5
|
||||
expectedDist01 := 0.5
|
||||
actualDist01 := dm.Get(0, 1)
|
||||
if math.Abs(actualDist01-expectedDist01) > 1e-10 {
|
||||
t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01)
|
||||
}
|
||||
|
||||
// Distance(0, 2):
|
||||
// Intersection: {} -> 0 elements
|
||||
// Union: {1, 2, 3, 5, 6, 7} -> 6 elements
|
||||
// Similarity: 0/6 = 0
|
||||
// Distance: 1 - 0 = 1.0
|
||||
expectedDist02 := 1.0
|
||||
actualDist02 := dm.Get(0, 2)
|
||||
if math.Abs(actualDist02-expectedDist02) > 1e-10 {
|
||||
t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02)
|
||||
}
|
||||
|
||||
// Distance(1, 2):
|
||||
// Intersection: {} -> 0 elements
|
||||
// Union: {2, 3, 4, 5, 6, 7} -> 6 elements
|
||||
// Similarity: 0/6 = 0
|
||||
// Distance: 1 - 0 = 1.0
|
||||
expectedDist12 := 1.0
|
||||
actualDist12 := dm.Get(1, 2)
|
||||
if math.Abs(actualDist12-expectedDist12) > 1e-10 {
|
||||
t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12)
|
||||
}
|
||||
|
||||
// Check symmetry
|
||||
if dm.Get(0, 1) != dm.Get(1, 0) {
|
||||
t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f",
|
||||
dm.Get(0, 1), dm.Get(1, 0))
|
||||
}
|
||||
|
||||
// Check diagonal
|
||||
if dm.Get(0, 0) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0))
|
||||
}
|
||||
if dm.Get(1, 1) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1))
|
||||
}
|
||||
if dm.Get(2, 2) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Set 0: {1, 2, 3}
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
ksg.Get(0).AddKmerCode(3)
|
||||
|
||||
// Set 1: {2, 3, 4}
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
ksg.Get(1).AddKmerCode(4)
|
||||
|
||||
// Set 2: {1, 2, 3} (same as set 0)
|
||||
ksg.Get(2).AddKmerCode(1)
|
||||
ksg.Get(2).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
|
||||
// Check similarities
|
||||
// Similarity(0, 1): 0.5 (as calculated above)
|
||||
expectedSim01 := 0.5
|
||||
actualSim01 := sm.Get(0, 1)
|
||||
if math.Abs(actualSim01-expectedSim01) > 1e-10 {
|
||||
t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01)
|
||||
}
|
||||
|
||||
// Similarity(0, 2): 1.0 (identical sets)
|
||||
expectedSim02 := 1.0
|
||||
actualSim02 := sm.Get(0, 2)
|
||||
if math.Abs(actualSim02-expectedSim02) > 1e-10 {
|
||||
t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02)
|
||||
}
|
||||
|
||||
// Similarity(1, 2): 0.5
|
||||
// Intersection: {2, 3} -> 2
|
||||
// Union: {1, 2, 3, 4} -> 4
|
||||
// Similarity: 2/4 = 0.5
|
||||
expectedSim12 := 0.5
|
||||
actualSim12 := sm.Get(1, 2)
|
||||
if math.Abs(actualSim12-expectedSim12) > 1e-10 {
|
||||
t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12)
|
||||
}
|
||||
|
||||
// Check diagonal (similarity to self = 1.0)
|
||||
if sm.Get(0, 0) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0))
|
||||
}
|
||||
if sm.Get(1, 1) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1))
|
||||
}
|
||||
if sm.Get(2, 2) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 4)
|
||||
|
||||
// Create different sets
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
|
||||
ksg.Get(2).AddKmerCode(1)
|
||||
ksg.Get(2).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
ksg.Get(3).AddKmerCode(10)
|
||||
ksg.Get(3).AddKmerCode(20)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
|
||||
// For all pairs (including diagonal), distance + similarity should equal 1.0
|
||||
for i := 0; i < 4; i++ {
|
||||
for j := 0; j < 4; j++ {
|
||||
distance := dm.Get(i, j)
|
||||
similarity := sm.Get(i, j)
|
||||
sum := distance + similarity
|
||||
|
||||
if math.Abs(sum-1.0) > 1e-10 {
|
||||
t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0",
|
||||
i, j, distance, similarity, sum)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Don't set IDs - should use default labels
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
// Check default labels
|
||||
if dm.GetLabel(0) != "set_0" {
|
||||
t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0))
|
||||
}
|
||||
if dm.GetLabel(1) != "set_1" {
|
||||
t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1))
|
||||
}
|
||||
if dm.GetLabel(2) != "set_2" {
|
||||
t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatrixSize(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 5)
|
||||
|
||||
for i := 0; i < 5; i++ {
|
||||
ksg.Get(i).AddKmerCode(uint64(i))
|
||||
}
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
if dm.Size() != 5 {
|
||||
t.Errorf("Expected matrix size 5, got %d", dm.Size())
|
||||
}
|
||||
|
||||
// All sets are disjoint, so all distances should be 1.0
|
||||
for i := 0; i < 5; i++ {
|
||||
for j := i + 1; j < 5; j++ {
|
||||
dist := dm.Get(i, j)
|
||||
if math.Abs(dist-1.0) > 1e-10 {
|
||||
t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f",
|
||||
i, j, dist)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
272
pkg/obikmer/kmer_set_test.go
Normal file
272
pkg/obikmer/kmer_set_test.go
Normal file
@@ -0,0 +1,272 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestJaccardDistanceIdentical(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(100)
|
||||
ks1.AddKmerCode(200)
|
||||
ks1.AddKmerCode(300)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(100)
|
||||
ks2.AddKmerCode(200)
|
||||
ks2.AddKmerCode(300)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
if distance != 0.0 {
|
||||
t.Errorf("Expected distance 0.0 for identical sets, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 1.0 {
|
||||
t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceDisjoint(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(100)
|
||||
ks1.AddKmerCode(200)
|
||||
ks1.AddKmerCode(300)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(400)
|
||||
ks2.AddKmerCode(500)
|
||||
ks2.AddKmerCode(600)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistancePartialOverlap(t *testing.T) {
|
||||
// Set 1: {1, 2, 3}
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
ks1.AddKmerCode(3)
|
||||
|
||||
// Set 2: {2, 3, 4}
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(2)
|
||||
ks2.AddKmerCode(3)
|
||||
ks2.AddKmerCode(4)
|
||||
|
||||
// Intersection: {2, 3} -> cardinality = 2
|
||||
// Union: {1, 2, 3, 4} -> cardinality = 4
|
||||
// Similarity = 2/4 = 0.5
|
||||
// Distance = 1 - 0.5 = 0.5
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
expectedDistance := 0.5
|
||||
expectedSimilarity := 0.5
|
||||
|
||||
if math.Abs(distance-expectedDistance) > 1e-10 {
|
||||
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
|
||||
}
|
||||
|
||||
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
|
||||
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceOneSubsetOfOther(t *testing.T) {
|
||||
// Set 1: {1, 2}
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
|
||||
// Set 2: {1, 2, 3, 4}
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(1)
|
||||
ks2.AddKmerCode(2)
|
||||
ks2.AddKmerCode(3)
|
||||
ks2.AddKmerCode(4)
|
||||
|
||||
// Intersection: {1, 2} -> cardinality = 2
|
||||
// Union: {1, 2, 3, 4} -> cardinality = 4
|
||||
// Similarity = 2/4 = 0.5
|
||||
// Distance = 1 - 0.5 = 0.5
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
expectedDistance := 0.5
|
||||
expectedSimilarity := 0.5
|
||||
|
||||
if math.Abs(distance-expectedDistance) > 1e-10 {
|
||||
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
|
||||
}
|
||||
|
||||
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
|
||||
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceEmptySets(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks2 := NewKmerSet(5)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
// By convention, distance = 1.0 for empty sets
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 for empty sets, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceOneEmpty(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
ks1.AddKmerCode(3)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
// Intersection: {} -> cardinality = 0
|
||||
// Union: {1, 2, 3} -> cardinality = 3
|
||||
// Similarity = 0/3 = 0.0
|
||||
// Distance = 1.0
|
||||
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceDifferentK(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
|
||||
ks2 := NewKmerSet(7)
|
||||
ks2.AddKmerCode(1)
|
||||
|
||||
defer func() {
|
||||
if r := recover(); r == nil {
|
||||
t.Errorf("Expected panic when computing Jaccard distance with different k values")
|
||||
}
|
||||
}()
|
||||
|
||||
_ = ks1.JaccardDistance(ks2)
|
||||
}
|
||||
|
||||
func TestJaccardDistanceSimilarityRelation(t *testing.T) {
|
||||
// Test that distance + similarity = 1.0 for all cases
|
||||
testCases := []struct {
|
||||
name string
|
||||
ks1 *KmerSet
|
||||
ks2 *KmerSet
|
||||
}{
|
||||
{
|
||||
name: "partial overlap",
|
||||
ks1: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(1)
|
||||
ks.AddKmerCode(2)
|
||||
ks.AddKmerCode(3)
|
||||
return ks
|
||||
}(),
|
||||
ks2: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(2)
|
||||
ks.AddKmerCode(3)
|
||||
ks.AddKmerCode(4)
|
||||
ks.AddKmerCode(5)
|
||||
return ks
|
||||
}(),
|
||||
},
|
||||
{
|
||||
name: "identical",
|
||||
ks1: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(10)
|
||||
ks.AddKmerCode(20)
|
||||
return ks
|
||||
}(),
|
||||
ks2: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(10)
|
||||
ks.AddKmerCode(20)
|
||||
return ks
|
||||
}(),
|
||||
},
|
||||
{
|
||||
name: "disjoint",
|
||||
ks1: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(1)
|
||||
return ks
|
||||
}(),
|
||||
ks2: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(100)
|
||||
return ks
|
||||
}(),
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
distance := tc.ks1.JaccardDistance(tc.ks2)
|
||||
similarity := tc.ks1.JaccardSimilarity(tc.ks2)
|
||||
|
||||
sum := distance + similarity
|
||||
|
||||
if math.Abs(sum-1.0) > 1e-10 {
|
||||
t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f",
|
||||
distance, similarity, sum)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceSymmetry(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
ks1.AddKmerCode(3)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(2)
|
||||
ks2.AddKmerCode(3)
|
||||
ks2.AddKmerCode(4)
|
||||
|
||||
distance1 := ks1.JaccardDistance(ks2)
|
||||
distance2 := ks2.JaccardDistance(ks1)
|
||||
|
||||
similarity1 := ks1.JaccardSimilarity(ks2)
|
||||
similarity2 := ks2.JaccardSimilarity(ks1)
|
||||
|
||||
if math.Abs(distance1-distance2) > 1e-10 {
|
||||
t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2)
|
||||
}
|
||||
|
||||
if math.Abs(similarity1-similarity2) > 1e-10 {
|
||||
t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user