Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup.

For KmerSet:
- Added JaccardDistance method to compute the Jaccard distance between two KmerSets
- Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets

For KmerSetGroup:
- Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix
- Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix

Also includes:
- New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices
- Updated version handling with bump-version target in Makefile
- Added tests for all new methods
This commit is contained in:
Eric Coissac
2026-02-05 17:38:47 +01:00
parent aa2e94dd6f
commit e3c41fc11b
9 changed files with 1328 additions and 28 deletions

View File

@@ -0,0 +1,231 @@
package obikmer
import (
"math"
"testing"
)
func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) {
ksg := NewKmerSetGroup(5, 3)
// Set 0: {1, 2, 3}
ksg.Get(0).AddKmerCode(1)
ksg.Get(0).AddKmerCode(2)
ksg.Get(0).AddKmerCode(3)
ksg.Get(0).SetId("set_A")
// Set 1: {2, 3, 4}
ksg.Get(1).AddKmerCode(2)
ksg.Get(1).AddKmerCode(3)
ksg.Get(1).AddKmerCode(4)
ksg.Get(1).SetId("set_B")
// Set 2: {5, 6, 7}
ksg.Get(2).AddKmerCode(5)
ksg.Get(2).AddKmerCode(6)
ksg.Get(2).AddKmerCode(7)
ksg.Get(2).SetId("set_C")
dm := ksg.JaccardDistanceMatrix()
// Check labels
if dm.GetLabel(0) != "set_A" {
t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0))
}
if dm.GetLabel(1) != "set_B" {
t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1))
}
if dm.GetLabel(2) != "set_C" {
t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2))
}
// Check distances
// Distance(0, 1):
// Intersection: {2, 3} -> 2 elements
// Union: {1, 2, 3, 4} -> 4 elements
// Similarity: 2/4 = 0.5
// Distance: 1 - 0.5 = 0.5
expectedDist01 := 0.5
actualDist01 := dm.Get(0, 1)
if math.Abs(actualDist01-expectedDist01) > 1e-10 {
t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01)
}
// Distance(0, 2):
// Intersection: {} -> 0 elements
// Union: {1, 2, 3, 5, 6, 7} -> 6 elements
// Similarity: 0/6 = 0
// Distance: 1 - 0 = 1.0
expectedDist02 := 1.0
actualDist02 := dm.Get(0, 2)
if math.Abs(actualDist02-expectedDist02) > 1e-10 {
t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02)
}
// Distance(1, 2):
// Intersection: {} -> 0 elements
// Union: {2, 3, 4, 5, 6, 7} -> 6 elements
// Similarity: 0/6 = 0
// Distance: 1 - 0 = 1.0
expectedDist12 := 1.0
actualDist12 := dm.Get(1, 2)
if math.Abs(actualDist12-expectedDist12) > 1e-10 {
t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12)
}
// Check symmetry
if dm.Get(0, 1) != dm.Get(1, 0) {
t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f",
dm.Get(0, 1), dm.Get(1, 0))
}
// Check diagonal
if dm.Get(0, 0) != 0.0 {
t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0))
}
if dm.Get(1, 1) != 0.0 {
t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1))
}
if dm.Get(2, 2) != 0.0 {
t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2))
}
}
func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) {
ksg := NewKmerSetGroup(5, 3)
// Set 0: {1, 2, 3}
ksg.Get(0).AddKmerCode(1)
ksg.Get(0).AddKmerCode(2)
ksg.Get(0).AddKmerCode(3)
// Set 1: {2, 3, 4}
ksg.Get(1).AddKmerCode(2)
ksg.Get(1).AddKmerCode(3)
ksg.Get(1).AddKmerCode(4)
// Set 2: {1, 2, 3} (same as set 0)
ksg.Get(2).AddKmerCode(1)
ksg.Get(2).AddKmerCode(2)
ksg.Get(2).AddKmerCode(3)
sm := ksg.JaccardSimilarityMatrix()
// Check similarities
// Similarity(0, 1): 0.5 (as calculated above)
expectedSim01 := 0.5
actualSim01 := sm.Get(0, 1)
if math.Abs(actualSim01-expectedSim01) > 1e-10 {
t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01)
}
// Similarity(0, 2): 1.0 (identical sets)
expectedSim02 := 1.0
actualSim02 := sm.Get(0, 2)
if math.Abs(actualSim02-expectedSim02) > 1e-10 {
t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02)
}
// Similarity(1, 2): 0.5
// Intersection: {2, 3} -> 2
// Union: {1, 2, 3, 4} -> 4
// Similarity: 2/4 = 0.5
expectedSim12 := 0.5
actualSim12 := sm.Get(1, 2)
if math.Abs(actualSim12-expectedSim12) > 1e-10 {
t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12)
}
// Check diagonal (similarity to self = 1.0)
if sm.Get(0, 0) != 1.0 {
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0))
}
if sm.Get(1, 1) != 1.0 {
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1))
}
if sm.Get(2, 2) != 1.0 {
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2))
}
}
func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) {
ksg := NewKmerSetGroup(5, 4)
// Create different sets
ksg.Get(0).AddKmerCode(1)
ksg.Get(0).AddKmerCode(2)
ksg.Get(1).AddKmerCode(2)
ksg.Get(1).AddKmerCode(3)
ksg.Get(2).AddKmerCode(1)
ksg.Get(2).AddKmerCode(2)
ksg.Get(2).AddKmerCode(3)
ksg.Get(3).AddKmerCode(10)
ksg.Get(3).AddKmerCode(20)
dm := ksg.JaccardDistanceMatrix()
sm := ksg.JaccardSimilarityMatrix()
// For all pairs (including diagonal), distance + similarity should equal 1.0
for i := 0; i < 4; i++ {
for j := 0; j < 4; j++ {
distance := dm.Get(i, j)
similarity := sm.Get(i, j)
sum := distance + similarity
if math.Abs(sum-1.0) > 1e-10 {
t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0",
i, j, distance, similarity, sum)
}
}
}
}
func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) {
ksg := NewKmerSetGroup(5, 3)
// Don't set IDs - should use default labels
ksg.Get(0).AddKmerCode(1)
ksg.Get(1).AddKmerCode(2)
ksg.Get(2).AddKmerCode(3)
dm := ksg.JaccardDistanceMatrix()
// Check default labels
if dm.GetLabel(0) != "set_0" {
t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0))
}
if dm.GetLabel(1) != "set_1" {
t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1))
}
if dm.GetLabel(2) != "set_2" {
t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2))
}
}
func TestKmerSetGroupJaccardMatrixSize(t *testing.T) {
ksg := NewKmerSetGroup(5, 5)
for i := 0; i < 5; i++ {
ksg.Get(i).AddKmerCode(uint64(i))
}
dm := ksg.JaccardDistanceMatrix()
if dm.Size() != 5 {
t.Errorf("Expected matrix size 5, got %d", dm.Size())
}
// All sets are disjoint, so all distances should be 1.0
for i := 0; i < 5; i++ {
for j := i + 1; j < 5; j++ {
dist := dm.Get(i, j)
if math.Abs(dist-1.0) > 1e-10 {
t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f",
i, j, dist)
}
}
}
}