mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup
Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup.

For KmerSet:
- Added JaccardDistance method to compute the Jaccard distance between two KmerSets
- Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets

For KmerSetGroup:
- Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix
- Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix

Also includes:
- New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices
- Updated version handling with bump-version target in Makefile
- Added tests for all new methods
This commit is contained in:
231
pkg/obikmer/kmer_set_group_jaccard_test.go
Normal file
231
pkg/obikmer/kmer_set_group_jaccard_test.go
Normal file
@@ -0,0 +1,231 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Set 0: {1, 2, 3}
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
ksg.Get(0).AddKmerCode(3)
|
||||
ksg.Get(0).SetId("set_A")
|
||||
|
||||
// Set 1: {2, 3, 4}
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
ksg.Get(1).AddKmerCode(4)
|
||||
ksg.Get(1).SetId("set_B")
|
||||
|
||||
// Set 2: {5, 6, 7}
|
||||
ksg.Get(2).AddKmerCode(5)
|
||||
ksg.Get(2).AddKmerCode(6)
|
||||
ksg.Get(2).AddKmerCode(7)
|
||||
ksg.Get(2).SetId("set_C")
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
// Check labels
|
||||
if dm.GetLabel(0) != "set_A" {
|
||||
t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0))
|
||||
}
|
||||
if dm.GetLabel(1) != "set_B" {
|
||||
t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1))
|
||||
}
|
||||
if dm.GetLabel(2) != "set_C" {
|
||||
t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2))
|
||||
}
|
||||
|
||||
// Check distances
|
||||
// Distance(0, 1):
|
||||
// Intersection: {2, 3} -> 2 elements
|
||||
// Union: {1, 2, 3, 4} -> 4 elements
|
||||
// Similarity: 2/4 = 0.5
|
||||
// Distance: 1 - 0.5 = 0.5
|
||||
expectedDist01 := 0.5
|
||||
actualDist01 := dm.Get(0, 1)
|
||||
if math.Abs(actualDist01-expectedDist01) > 1e-10 {
|
||||
t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01)
|
||||
}
|
||||
|
||||
// Distance(0, 2):
|
||||
// Intersection: {} -> 0 elements
|
||||
// Union: {1, 2, 3, 5, 6, 7} -> 6 elements
|
||||
// Similarity: 0/6 = 0
|
||||
// Distance: 1 - 0 = 1.0
|
||||
expectedDist02 := 1.0
|
||||
actualDist02 := dm.Get(0, 2)
|
||||
if math.Abs(actualDist02-expectedDist02) > 1e-10 {
|
||||
t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02)
|
||||
}
|
||||
|
||||
// Distance(1, 2):
|
||||
// Intersection: {} -> 0 elements
|
||||
// Union: {2, 3, 4, 5, 6, 7} -> 6 elements
|
||||
// Similarity: 0/6 = 0
|
||||
// Distance: 1 - 0 = 1.0
|
||||
expectedDist12 := 1.0
|
||||
actualDist12 := dm.Get(1, 2)
|
||||
if math.Abs(actualDist12-expectedDist12) > 1e-10 {
|
||||
t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12)
|
||||
}
|
||||
|
||||
// Check symmetry
|
||||
if dm.Get(0, 1) != dm.Get(1, 0) {
|
||||
t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f",
|
||||
dm.Get(0, 1), dm.Get(1, 0))
|
||||
}
|
||||
|
||||
// Check diagonal
|
||||
if dm.Get(0, 0) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0))
|
||||
}
|
||||
if dm.Get(1, 1) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1))
|
||||
}
|
||||
if dm.Get(2, 2) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Set 0: {1, 2, 3}
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
ksg.Get(0).AddKmerCode(3)
|
||||
|
||||
// Set 1: {2, 3, 4}
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
ksg.Get(1).AddKmerCode(4)
|
||||
|
||||
// Set 2: {1, 2, 3} (same as set 0)
|
||||
ksg.Get(2).AddKmerCode(1)
|
||||
ksg.Get(2).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
|
||||
// Check similarities
|
||||
// Similarity(0, 1): 0.5 (as calculated above)
|
||||
expectedSim01 := 0.5
|
||||
actualSim01 := sm.Get(0, 1)
|
||||
if math.Abs(actualSim01-expectedSim01) > 1e-10 {
|
||||
t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01)
|
||||
}
|
||||
|
||||
// Similarity(0, 2): 1.0 (identical sets)
|
||||
expectedSim02 := 1.0
|
||||
actualSim02 := sm.Get(0, 2)
|
||||
if math.Abs(actualSim02-expectedSim02) > 1e-10 {
|
||||
t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02)
|
||||
}
|
||||
|
||||
// Similarity(1, 2): 0.5
|
||||
// Intersection: {2, 3} -> 2
|
||||
// Union: {1, 2, 3, 4} -> 4
|
||||
// Similarity: 2/4 = 0.5
|
||||
expectedSim12 := 0.5
|
||||
actualSim12 := sm.Get(1, 2)
|
||||
if math.Abs(actualSim12-expectedSim12) > 1e-10 {
|
||||
t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12)
|
||||
}
|
||||
|
||||
// Check diagonal (similarity to self = 1.0)
|
||||
if sm.Get(0, 0) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0))
|
||||
}
|
||||
if sm.Get(1, 1) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1))
|
||||
}
|
||||
if sm.Get(2, 2) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 4)
|
||||
|
||||
// Create different sets
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
|
||||
ksg.Get(2).AddKmerCode(1)
|
||||
ksg.Get(2).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
ksg.Get(3).AddKmerCode(10)
|
||||
ksg.Get(3).AddKmerCode(20)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
|
||||
// For all pairs (including diagonal), distance + similarity should equal 1.0
|
||||
for i := 0; i < 4; i++ {
|
||||
for j := 0; j < 4; j++ {
|
||||
distance := dm.Get(i, j)
|
||||
similarity := sm.Get(i, j)
|
||||
sum := distance + similarity
|
||||
|
||||
if math.Abs(sum-1.0) > 1e-10 {
|
||||
t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0",
|
||||
i, j, distance, similarity, sum)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Don't set IDs - should use default labels
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
// Check default labels
|
||||
if dm.GetLabel(0) != "set_0" {
|
||||
t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0))
|
||||
}
|
||||
if dm.GetLabel(1) != "set_1" {
|
||||
t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1))
|
||||
}
|
||||
if dm.GetLabel(2) != "set_2" {
|
||||
t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatrixSize(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 5)
|
||||
|
||||
for i := 0; i < 5; i++ {
|
||||
ksg.Get(i).AddKmerCode(uint64(i))
|
||||
}
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
if dm.Size() != 5 {
|
||||
t.Errorf("Expected matrix size 5, got %d", dm.Size())
|
||||
}
|
||||
|
||||
// All sets are disjoint, so all distances should be 1.0
|
||||
for i := 0; i < 5; i++ {
|
||||
for j := i + 1; j < 5; j++ {
|
||||
dist := dm.Get(i, j)
|
||||
if math.Abs(dist-1.0) > 1e-10 {
|
||||
t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f",
|
||||
i, j, dist)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user