mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup
Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup. For KmerSet: - Added JaccardDistance method to compute the Jaccard distance between two KmerSets - Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets For KmerSetGroup: - Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix - Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix Also includes: - New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices - Updated version handling with bump-version target in Makefile - Added tests for all new methods
This commit is contained in:
272
pkg/obidist/dist_matrix.go
Normal file
272
pkg/obidist/dist_matrix.go
Normal file
@@ -0,0 +1,272 @@
|
||||
package obidist
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// DistMatrix represents a symmetric matrix stored as a triangular matrix.
|
||||
// The diagonal has a constant value (typically 0 for distances, 1 for similarities).
|
||||
// Only the upper triangle (i < j) is stored to save memory.
|
||||
//
|
||||
// For an n×n matrix, we store n(n-1)/2 values.
|
||||
type DistMatrix struct {
|
||||
n int // Number of elements (matrix dimension)
|
||||
data []float64 // Triangular storage: upper triangle only
|
||||
labels []string // Optional labels for rows/columns
|
||||
diagonalValue float64 // Value on the diagonal
|
||||
}
|
||||
|
||||
// NewDistMatrix creates a new distance matrix of size n×n.
|
||||
// All distances are initialized to 0.0, diagonal is 0.0.
|
||||
func NewDistMatrix(n int) *DistMatrix {
|
||||
if n < 0 {
|
||||
panic("matrix size must be non-negative")
|
||||
}
|
||||
|
||||
// Number of elements in upper triangle: n(n-1)/2
|
||||
size := n * (n - 1) / 2
|
||||
|
||||
return &DistMatrix{
|
||||
n: n,
|
||||
data: make([]float64, size),
|
||||
labels: make([]string, n),
|
||||
diagonalValue: 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
// NewDistMatrixWithLabels creates a new distance matrix with labels.
|
||||
// Diagonal is 0.0 by default.
|
||||
func NewDistMatrixWithLabels(labels []string) *DistMatrix {
|
||||
dm := NewDistMatrix(len(labels))
|
||||
copy(dm.labels, labels)
|
||||
return dm
|
||||
}
|
||||
|
||||
// NewSimilarityMatrix creates a new similarity matrix of size n×n.
|
||||
// All off-diagonal values are initialized to 0.0, diagonal is 1.0.
|
||||
func NewSimilarityMatrix(n int) *DistMatrix {
|
||||
if n < 0 {
|
||||
panic("matrix size must be non-negative")
|
||||
}
|
||||
|
||||
// Number of elements in upper triangle: n(n-1)/2
|
||||
size := n * (n - 1) / 2
|
||||
|
||||
return &DistMatrix{
|
||||
n: n,
|
||||
data: make([]float64, size),
|
||||
labels: make([]string, n),
|
||||
diagonalValue: 1.0,
|
||||
}
|
||||
}
|
||||
|
||||
// NewSimilarityMatrixWithLabels creates a new similarity matrix with labels.
|
||||
// Diagonal is 1.0.
|
||||
func NewSimilarityMatrixWithLabels(labels []string) *DistMatrix {
|
||||
dm := NewSimilarityMatrix(len(labels))
|
||||
copy(dm.labels, labels)
|
||||
return dm
|
||||
}
|
||||
|
||||
// Size returns the dimension of the matrix (n for an n×n matrix).
|
||||
func (dm *DistMatrix) Size() int {
|
||||
return dm.n
|
||||
}
|
||||
|
||||
// indexFor computes the index in the data array for element (i, j).
|
||||
// Assumes i < j (upper triangle).
|
||||
//
|
||||
// The upper triangle is stored row by row:
|
||||
// (0,1), (0,2), ..., (0,n-1), (1,2), (1,3), ..., (1,n-1), (2,3), ...
|
||||
//
|
||||
// For element (i, j) where i < j:
|
||||
// index = i*(n-1) + j - 1 - i*(i+1)/2
|
||||
//
|
||||
// This can be simplified to:
|
||||
// index = i*n - i*(i+1)/2 + j - i - 1
|
||||
// = i*(n - (i+1)/2 - 1) + j - 1
|
||||
// = i*(n - 1 - i/2 - 1/2) + j - 1
|
||||
//
|
||||
// But the clearest formula is:
|
||||
// index = i*n - i*(i+3)/2 + j - 1
|
||||
func (dm *DistMatrix) indexFor(i, j int) int {
|
||||
if i >= j {
|
||||
panic(fmt.Sprintf("indexFor expects i < j, got i=%d, j=%d", i, j))
|
||||
}
|
||||
// Formula: number of elements in previous rows + position in current row
|
||||
// Previous rows (0 to i-1): sum from k=0 to i-1 of (n-1-k) = i*n - i*(i+1)/2
|
||||
// Current row position: j - i - 1
|
||||
return i*dm.n - i*(i+1)/2 + j - i - 1
|
||||
}
|
||||
|
||||
// Get returns the value at position (i, j).
|
||||
// The matrix is symmetric, so Get(i, j) == Get(j, i).
|
||||
// The diagonal returns the diagonalValue (0.0 for distances, 1.0 for similarities).
|
||||
func (dm *DistMatrix) Get(i, j int) float64 {
|
||||
if i < 0 || i >= dm.n || j < 0 || j >= dm.n {
|
||||
panic(fmt.Sprintf("indices out of bounds: i=%d, j=%d, n=%d", i, j, dm.n))
|
||||
}
|
||||
|
||||
// Diagonal: return the diagonal value
|
||||
if i == j {
|
||||
return dm.diagonalValue
|
||||
}
|
||||
|
||||
// Ensure i < j for indexing
|
||||
if i > j {
|
||||
i, j = j, i
|
||||
}
|
||||
|
||||
return dm.data[dm.indexFor(i, j)]
|
||||
}
|
||||
|
||||
// Set sets the value at position (i, j).
|
||||
// The matrix is symmetric, so Set(i, j, v) also sets (j, i) to v.
|
||||
// Setting the diagonal (i == j) is ignored (diagonal has a fixed value).
|
||||
func (dm *DistMatrix) Set(i, j int, value float64) {
|
||||
if i < 0 || i >= dm.n || j < 0 || j >= dm.n {
|
||||
panic(fmt.Sprintf("indices out of bounds: i=%d, j=%d, n=%d", i, j, dm.n))
|
||||
}
|
||||
|
||||
// Ignore diagonal assignments (diagonal has a fixed value)
|
||||
if i == j {
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure i < j for indexing
|
||||
if i > j {
|
||||
i, j = j, i
|
||||
}
|
||||
|
||||
dm.data[dm.indexFor(i, j)] = value
|
||||
}
|
||||
|
||||
// GetLabel returns the label for element i.
|
||||
func (dm *DistMatrix) GetLabel(i int) string {
|
||||
if i < 0 || i >= dm.n {
|
||||
panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n))
|
||||
}
|
||||
return dm.labels[i]
|
||||
}
|
||||
|
||||
// SetLabel sets the label for element i.
|
||||
func (dm *DistMatrix) SetLabel(i int, label string) {
|
||||
if i < 0 || i >= dm.n {
|
||||
panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n))
|
||||
}
|
||||
dm.labels[i] = label
|
||||
}
|
||||
|
||||
// Labels returns a copy of all labels.
|
||||
func (dm *DistMatrix) Labels() []string {
|
||||
labels := make([]string, dm.n)
|
||||
copy(labels, dm.labels)
|
||||
return labels
|
||||
}
|
||||
|
||||
// GetRow returns the i-th row of the distance matrix.
|
||||
// The returned slice is a copy.
|
||||
func (dm *DistMatrix) GetRow(i int) []float64 {
|
||||
if i < 0 || i >= dm.n {
|
||||
panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n))
|
||||
}
|
||||
|
||||
row := make([]float64, dm.n)
|
||||
for j := 0; j < dm.n; j++ {
|
||||
row[j] = dm.Get(i, j)
|
||||
}
|
||||
return row
|
||||
}
|
||||
|
||||
// GetColumn returns the j-th column of the distance matrix.
|
||||
// Since the matrix is symmetric, GetColumn(j) == GetRow(j).
|
||||
// The returned slice is a copy.
|
||||
func (dm *DistMatrix) GetColumn(j int) []float64 {
|
||||
return dm.GetRow(j)
|
||||
}
|
||||
|
||||
// MinDistance returns the minimum non-zero distance in the matrix,
|
||||
// along with the indices (i, j) where it occurs.
|
||||
// Returns (0.0, -1, -1) if the matrix is empty or all distances are 0.
|
||||
func (dm *DistMatrix) MinDistance() (float64, int, int) {
|
||||
if dm.n <= 1 {
|
||||
return 0.0, -1, -1
|
||||
}
|
||||
|
||||
minDist := -1.0
|
||||
minI, minJ := -1, -1
|
||||
|
||||
for i := 0; i < dm.n-1; i++ {
|
||||
for j := i + 1; j < dm.n; j++ {
|
||||
dist := dm.Get(i, j)
|
||||
if minDist < 0 || dist < minDist {
|
||||
minDist = dist
|
||||
minI = i
|
||||
minJ = j
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if minI < 0 {
|
||||
return 0.0, -1, -1
|
||||
}
|
||||
|
||||
return minDist, minI, minJ
|
||||
}
|
||||
|
||||
// MaxDistance returns the maximum distance in the matrix,
|
||||
// along with the indices (i, j) where it occurs.
|
||||
// Returns (0.0, -1, -1) if the matrix is empty.
|
||||
func (dm *DistMatrix) MaxDistance() (float64, int, int) {
|
||||
if dm.n <= 1 {
|
||||
return 0.0, -1, -1
|
||||
}
|
||||
|
||||
maxDist := -1.0
|
||||
maxI, maxJ := -1, -1
|
||||
|
||||
for i := 0; i < dm.n-1; i++ {
|
||||
for j := i + 1; j < dm.n; j++ {
|
||||
dist := dm.Get(i, j)
|
||||
if maxDist < 0 || dist > maxDist {
|
||||
maxDist = dist
|
||||
maxI = i
|
||||
maxJ = j
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if maxI < 0 {
|
||||
return 0.0, -1, -1
|
||||
}
|
||||
|
||||
return maxDist, maxI, maxJ
|
||||
}
|
||||
|
||||
// Copy creates a deep copy of the matrix.
|
||||
func (dm *DistMatrix) Copy() *DistMatrix {
|
||||
newDM := &DistMatrix{
|
||||
n: dm.n,
|
||||
data: make([]float64, len(dm.data)),
|
||||
labels: make([]string, dm.n),
|
||||
diagonalValue: dm.diagonalValue,
|
||||
}
|
||||
|
||||
copy(newDM.data, dm.data)
|
||||
copy(newDM.labels, dm.labels)
|
||||
|
||||
return newDM
|
||||
}
|
||||
|
||||
// ToFullMatrix returns a full n×n matrix representation.
|
||||
// This allocates n² values, so use only when needed.
|
||||
func (dm *DistMatrix) ToFullMatrix() [][]float64 {
|
||||
matrix := make([][]float64, dm.n)
|
||||
for i := 0; i < dm.n; i++ {
|
||||
matrix[i] = make([]float64, dm.n)
|
||||
for j := 0; j < dm.n; j++ {
|
||||
matrix[i][j] = dm.Get(i, j)
|
||||
}
|
||||
}
|
||||
return matrix
|
||||
}
|
||||
386
pkg/obidist/dist_matrix_test.go
Normal file
386
pkg/obidist/dist_matrix_test.go
Normal file
@@ -0,0 +1,386 @@
|
||||
package obidist
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNewDistMatrix(t *testing.T) {
|
||||
dm := NewDistMatrix(5)
|
||||
|
||||
if dm.Size() != 5 {
|
||||
t.Errorf("Expected size 5, got %d", dm.Size())
|
||||
}
|
||||
|
||||
// Check that all values are initialized to 0
|
||||
for i := 0; i < 5; i++ {
|
||||
for j := 0; j < 5; j++ {
|
||||
if dm.Get(i, j) != 0.0 {
|
||||
t.Errorf("Expected 0.0 at (%d, %d), got %f", i, j, dm.Get(i, j))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixDiagonal(t *testing.T) {
|
||||
dm := NewDistMatrix(5)
|
||||
|
||||
// Diagonal should always be 0
|
||||
for i := 0; i < 5; i++ {
|
||||
if dm.Get(i, i) != 0.0 {
|
||||
t.Errorf("Expected diagonal element (%d, %d) to be 0.0, got %f", i, i, dm.Get(i, i))
|
||||
}
|
||||
}
|
||||
|
||||
// Try to set diagonal (should be ignored)
|
||||
dm.Set(2, 2, 5.0)
|
||||
if dm.Get(2, 2) != 0.0 {
|
||||
t.Errorf("Diagonal should remain 0.0 even after Set, got %f", dm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixSymmetry(t *testing.T) {
|
||||
dm := NewDistMatrix(4)
|
||||
|
||||
dm.Set(0, 1, 1.5)
|
||||
dm.Set(0, 2, 2.5)
|
||||
dm.Set(1, 3, 3.5)
|
||||
|
||||
// Check symmetry
|
||||
if dm.Get(0, 1) != dm.Get(1, 0) {
|
||||
t.Errorf("Matrix not symmetric: Get(0,1)=%f, Get(1,0)=%f", dm.Get(0, 1), dm.Get(1, 0))
|
||||
}
|
||||
|
||||
if dm.Get(0, 2) != dm.Get(2, 0) {
|
||||
t.Errorf("Matrix not symmetric: Get(0,2)=%f, Get(2,0)=%f", dm.Get(0, 2), dm.Get(2, 0))
|
||||
}
|
||||
|
||||
if dm.Get(1, 3) != dm.Get(3, 1) {
|
||||
t.Errorf("Matrix not symmetric: Get(1,3)=%f, Get(3,1)=%f", dm.Get(1, 3), dm.Get(3, 1))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixSetGet(t *testing.T) {
|
||||
dm := NewDistMatrix(4)
|
||||
|
||||
testCases := []struct {
|
||||
i int
|
||||
j int
|
||||
value float64
|
||||
}{
|
||||
{0, 1, 1.5},
|
||||
{0, 2, 2.5},
|
||||
{0, 3, 3.5},
|
||||
{1, 2, 4.5},
|
||||
{1, 3, 5.5},
|
||||
{2, 3, 6.5},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
dm.Set(tc.i, tc.j, tc.value)
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
got := dm.Get(tc.i, tc.j)
|
||||
if math.Abs(got-tc.value) > 1e-10 {
|
||||
t.Errorf("Get(%d, %d): expected %f, got %f", tc.i, tc.j, tc.value, got)
|
||||
}
|
||||
|
||||
// Check symmetry
|
||||
got = dm.Get(tc.j, tc.i)
|
||||
if math.Abs(got-tc.value) > 1e-10 {
|
||||
t.Errorf("Get(%d, %d) (symmetric): expected %f, got %f", tc.j, tc.i, tc.value, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixLabels(t *testing.T) {
|
||||
labels := []string{"A", "B", "C", "D"}
|
||||
dm := NewDistMatrixWithLabels(labels)
|
||||
|
||||
if dm.Size() != 4 {
|
||||
t.Errorf("Expected size 4, got %d", dm.Size())
|
||||
}
|
||||
|
||||
for i, label := range labels {
|
||||
if dm.GetLabel(i) != label {
|
||||
t.Errorf("Expected label %s at index %d, got %s", label, i, dm.GetLabel(i))
|
||||
}
|
||||
}
|
||||
|
||||
// Modify a label
|
||||
dm.SetLabel(1, "Modified")
|
||||
if dm.GetLabel(1) != "Modified" {
|
||||
t.Errorf("Expected label 'Modified' at index 1, got %s", dm.GetLabel(1))
|
||||
}
|
||||
|
||||
// Check Labels() returns a copy
|
||||
labelsCopy := dm.Labels()
|
||||
labelsCopy[0] = "ChangedCopy"
|
||||
if dm.GetLabel(0) != "A" {
|
||||
t.Errorf("Modifying Labels() return value should not affect original labels")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixMinDistance(t *testing.T) {
|
||||
dm := NewDistMatrix(4)
|
||||
|
||||
dm.Set(0, 1, 2.5)
|
||||
dm.Set(0, 2, 1.5) // minimum
|
||||
dm.Set(0, 3, 3.5)
|
||||
dm.Set(1, 2, 4.5)
|
||||
dm.Set(1, 3, 5.5)
|
||||
dm.Set(2, 3, 6.5)
|
||||
|
||||
minDist, minI, minJ := dm.MinDistance()
|
||||
|
||||
if math.Abs(minDist-1.5) > 1e-10 {
|
||||
t.Errorf("Expected min distance 1.5, got %f", minDist)
|
||||
}
|
||||
|
||||
if (minI != 0 || minJ != 2) && (minI != 2 || minJ != 0) {
|
||||
t.Errorf("Expected min at (0, 2) or (2, 0), got (%d, %d)", minI, minJ)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixMaxDistance(t *testing.T) {
|
||||
dm := NewDistMatrix(4)
|
||||
|
||||
dm.Set(0, 1, 2.5)
|
||||
dm.Set(0, 2, 1.5)
|
||||
dm.Set(0, 3, 3.5)
|
||||
dm.Set(1, 2, 4.5)
|
||||
dm.Set(1, 3, 5.5)
|
||||
dm.Set(2, 3, 6.5) // maximum
|
||||
|
||||
maxDist, maxI, maxJ := dm.MaxDistance()
|
||||
|
||||
if math.Abs(maxDist-6.5) > 1e-10 {
|
||||
t.Errorf("Expected max distance 6.5, got %f", maxDist)
|
||||
}
|
||||
|
||||
if (maxI != 2 || maxJ != 3) && (maxI != 3 || maxJ != 2) {
|
||||
t.Errorf("Expected max at (2, 3) or (3, 2), got (%d, %d)", maxI, maxJ)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixGetRow(t *testing.T) {
|
||||
dm := NewDistMatrix(3)
|
||||
|
||||
dm.Set(0, 1, 1.0)
|
||||
dm.Set(0, 2, 2.0)
|
||||
dm.Set(1, 2, 3.0)
|
||||
|
||||
row0 := dm.GetRow(0)
|
||||
expected0 := []float64{0.0, 1.0, 2.0}
|
||||
|
||||
for i, val := range expected0 {
|
||||
if math.Abs(row0[i]-val) > 1e-10 {
|
||||
t.Errorf("Row 0[%d]: expected %f, got %f", i, val, row0[i])
|
||||
}
|
||||
}
|
||||
|
||||
row1 := dm.GetRow(1)
|
||||
expected1 := []float64{1.0, 0.0, 3.0}
|
||||
|
||||
for i, val := range expected1 {
|
||||
if math.Abs(row1[i]-val) > 1e-10 {
|
||||
t.Errorf("Row 1[%d]: expected %f, got %f", i, val, row1[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixCopy(t *testing.T) {
|
||||
dm := NewDistMatrixWithLabels([]string{"A", "B", "C"})
|
||||
dm.Set(0, 1, 1.5)
|
||||
dm.Set(0, 2, 2.5)
|
||||
dm.Set(1, 2, 3.5)
|
||||
|
||||
dmCopy := dm.Copy()
|
||||
|
||||
// Check values are copied
|
||||
if dmCopy.Get(0, 1) != dm.Get(0, 1) {
|
||||
t.Errorf("Copy has different value at (0, 1)")
|
||||
}
|
||||
|
||||
// Check labels are copied
|
||||
if dmCopy.GetLabel(0) != dm.GetLabel(0) {
|
||||
t.Errorf("Copy has different label at index 0")
|
||||
}
|
||||
|
||||
// Modify copy and ensure original unchanged
|
||||
dmCopy.Set(0, 1, 99.9)
|
||||
if dm.Get(0, 1) == 99.9 {
|
||||
t.Errorf("Modifying copy affected original matrix")
|
||||
}
|
||||
|
||||
dmCopy.SetLabel(0, "Modified")
|
||||
if dm.GetLabel(0) == "Modified" {
|
||||
t.Errorf("Modifying copy label affected original matrix")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixToFullMatrix(t *testing.T) {
|
||||
dm := NewDistMatrix(3)
|
||||
dm.Set(0, 1, 1.0)
|
||||
dm.Set(0, 2, 2.0)
|
||||
dm.Set(1, 2, 3.0)
|
||||
|
||||
full := dm.ToFullMatrix()
|
||||
|
||||
expected := [][]float64{
|
||||
{0.0, 1.0, 2.0},
|
||||
{1.0, 0.0, 3.0},
|
||||
{2.0, 3.0, 0.0},
|
||||
}
|
||||
|
||||
for i := 0; i < 3; i++ {
|
||||
for j := 0; j < 3; j++ {
|
||||
if math.Abs(full[i][j]-expected[i][j]) > 1e-10 {
|
||||
t.Errorf("Full matrix[%d][%d]: expected %f, got %f",
|
||||
i, j, expected[i][j], full[i][j])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixBoundsChecking(t *testing.T) {
|
||||
dm := NewDistMatrix(3)
|
||||
|
||||
// Test Get out of bounds
|
||||
testPanic := func(f func()) {
|
||||
defer func() {
|
||||
if r := recover(); r == nil {
|
||||
t.Errorf("Expected panic, but didn't get one")
|
||||
}
|
||||
}()
|
||||
f()
|
||||
}
|
||||
|
||||
testPanic(func() { dm.Get(-1, 0) })
|
||||
testPanic(func() { dm.Get(0, 3) })
|
||||
testPanic(func() { dm.Set(3, 0, 1.0) })
|
||||
testPanic(func() { dm.GetLabel(-1) })
|
||||
testPanic(func() { dm.SetLabel(3, "Invalid") })
|
||||
testPanic(func() { dm.GetRow(3) })
|
||||
}
|
||||
|
||||
func TestDistMatrixEmptyMatrix(t *testing.T) {
|
||||
dm := NewDistMatrix(0)
|
||||
|
||||
if dm.Size() != 0 {
|
||||
t.Errorf("Expected size 0, got %d", dm.Size())
|
||||
}
|
||||
|
||||
minDist, minI, minJ := dm.MinDistance()
|
||||
if minDist != 0.0 || minI != -1 || minJ != -1 {
|
||||
t.Errorf("Empty matrix MinDistance should return (0.0, -1, -1), got (%f, %d, %d)",
|
||||
minDist, minI, minJ)
|
||||
}
|
||||
|
||||
maxDist, maxI, maxJ := dm.MaxDistance()
|
||||
if maxDist != 0.0 || maxI != -1 || maxJ != -1 {
|
||||
t.Errorf("Empty matrix MaxDistance should return (0.0, -1, -1), got (%f, %d, %d)",
|
||||
maxDist, maxI, maxJ)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistMatrixSingleElement(t *testing.T) {
|
||||
dm := NewDistMatrix(1)
|
||||
|
||||
if dm.Size() != 1 {
|
||||
t.Errorf("Expected size 1, got %d", dm.Size())
|
||||
}
|
||||
|
||||
// Only element is diagonal (always 0)
|
||||
if dm.Get(0, 0) != 0.0 {
|
||||
t.Errorf("Expected 0.0 at (0, 0), got %f", dm.Get(0, 0))
|
||||
}
|
||||
|
||||
minDist, minI, minJ := dm.MinDistance()
|
||||
if minDist != 0.0 || minI != -1 || minJ != -1 {
|
||||
t.Errorf("Single element matrix MinDistance should return (0.0, -1, -1), got (%f, %d, %d)",
|
||||
minDist, minI, minJ)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewSimilarityMatrix(t *testing.T) {
|
||||
sm := NewSimilarityMatrix(4)
|
||||
|
||||
if sm.Size() != 4 {
|
||||
t.Errorf("Expected size 4, got %d", sm.Size())
|
||||
}
|
||||
|
||||
// Check diagonal is 1.0
|
||||
for i := 0; i < 4; i++ {
|
||||
if sm.Get(i, i) != 1.0 {
|
||||
t.Errorf("Expected diagonal element (%d, %d) to be 1.0, got %f", i, i, sm.Get(i, i))
|
||||
}
|
||||
}
|
||||
|
||||
// Check off-diagonal is 0.0
|
||||
if sm.Get(0, 1) != 0.0 {
|
||||
t.Errorf("Expected off-diagonal to be 0.0, got %f", sm.Get(0, 1))
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewSimilarityMatrixWithLabels(t *testing.T) {
|
||||
labels := []string{"A", "B", "C"}
|
||||
sm := NewSimilarityMatrixWithLabels(labels)
|
||||
|
||||
if sm.Size() != 3 {
|
||||
t.Errorf("Expected size 3, got %d", sm.Size())
|
||||
}
|
||||
|
||||
// Check labels
|
||||
for i, label := range labels {
|
||||
if sm.GetLabel(i) != label {
|
||||
t.Errorf("Expected label %s at index %d, got %s", label, i, sm.GetLabel(i))
|
||||
}
|
||||
}
|
||||
|
||||
// Check diagonal is 1.0
|
||||
for i := 0; i < 3; i++ {
|
||||
if sm.Get(i, i) != 1.0 {
|
||||
t.Errorf("Expected diagonal element (%d, %d) to be 1.0, got %f", i, i, sm.Get(i, i))
|
||||
}
|
||||
}
|
||||
|
||||
// Set some similarities
|
||||
sm.Set(0, 1, 0.8)
|
||||
sm.Set(0, 2, 0.6)
|
||||
sm.Set(1, 2, 0.7)
|
||||
|
||||
// Check values
|
||||
if math.Abs(sm.Get(0, 1)-0.8) > 1e-10 {
|
||||
t.Errorf("Expected 0.8 at (0, 1), got %f", sm.Get(0, 1))
|
||||
}
|
||||
|
||||
if math.Abs(sm.Get(1, 0)-0.8) > 1e-10 {
|
||||
t.Errorf("Expected 0.8 at (1, 0) (symmetry), got %f", sm.Get(1, 0))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSimilarityMatrixCopy(t *testing.T) {
|
||||
sm := NewSimilarityMatrix(3)
|
||||
sm.Set(0, 1, 0.9)
|
||||
sm.Set(0, 2, 0.7)
|
||||
|
||||
smCopy := sm.Copy()
|
||||
|
||||
// Check diagonal is preserved
|
||||
if smCopy.Get(0, 0) != 1.0 {
|
||||
t.Errorf("Copied similarity matrix should have diagonal 1.0, got %f", smCopy.Get(0, 0))
|
||||
}
|
||||
|
||||
// Check values are preserved
|
||||
if math.Abs(smCopy.Get(0, 1)-0.9) > 1e-10 {
|
||||
t.Errorf("Copy should preserve values, expected 0.9, got %f", smCopy.Get(0, 1))
|
||||
}
|
||||
|
||||
// Modify copy and ensure original unchanged
|
||||
smCopy.Set(0, 1, 0.5)
|
||||
if math.Abs(sm.Get(0, 1)-0.9) > 1e-10 {
|
||||
t.Errorf("Modifying copy should not affect original, expected 0.9, got %f", sm.Get(0, 1))
|
||||
}
|
||||
}
|
||||
@@ -158,6 +158,54 @@ func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
|
||||
return NewKmerSetFromBitmap(ks.k, result)
|
||||
}
|
||||
|
||||
// JaccardDistance computes the Jaccard distance between two KmerSets.
|
||||
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|)
|
||||
// where A and B are the two sets.
|
||||
//
|
||||
// Returns:
|
||||
// - 0.0 when sets are identical (distance = 0, similarity = 1)
|
||||
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
|
||||
// - 1.0 when both sets are empty (by convention)
|
||||
//
|
||||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
||||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
||||
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
|
||||
if ks.k != other.k {
|
||||
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
|
||||
}
|
||||
|
||||
// Compute intersection cardinality
|
||||
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
|
||||
|
||||
// Compute union cardinality
|
||||
unionCard := ks.bitmap.OrCardinality(other.bitmap)
|
||||
|
||||
// If union is empty, both sets are empty - return 1.0 by convention
|
||||
if unionCard == 0 {
|
||||
return 1.0
|
||||
}
|
||||
|
||||
// Jaccard similarity = |A ∩ B| / |A ∪ B|
|
||||
similarity := float64(intersectionCard) / float64(unionCard)
|
||||
|
||||
// Jaccard distance = 1 - similarity
|
||||
return 1.0 - similarity
|
||||
}
|
||||
|
||||
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
|
||||
// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B|
|
||||
//
|
||||
// Returns:
|
||||
// - 1.0 when sets are identical (maximum similarity)
|
||||
// - 0.0 when sets are completely disjoint (no similarity)
|
||||
// - 0.0 when both sets are empty (by convention)
|
||||
//
|
||||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
||||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
||||
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
|
||||
return 1.0 - ks.JaccardDistance(other)
|
||||
}
|
||||
|
||||
// Iterator returns an iterator over all k-mers in the set
|
||||
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
|
||||
return ks.bitmap.Iterator()
|
||||
|
||||
@@ -3,6 +3,7 @@ package obikmer
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
@@ -260,3 +261,79 @@ Set breakdown:
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group.
|
||||
// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance
|
||||
// between set i and set j.
|
||||
//
|
||||
// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|)
|
||||
//
|
||||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
||||
// otherwise they are set to "set_0", "set_1", etc.
|
||||
//
|
||||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
||||
// Space complexity: O(n²) for the distance matrix
|
||||
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Create labels from set IDs
|
||||
labels := make([]string, n)
|
||||
for i, ks := range ksg.sets {
|
||||
if ks.Id() != "" {
|
||||
labels[i] = ks.Id()
|
||||
} else {
|
||||
labels[i] = fmt.Sprintf("set_%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
dm := obidist.NewDistMatrixWithLabels(labels)
|
||||
|
||||
// Compute pairwise distances
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
distance := ksg.sets[i].JaccardDistance(ksg.sets[j])
|
||||
dm.Set(i, j, distance)
|
||||
}
|
||||
}
|
||||
|
||||
return dm
|
||||
}
|
||||
|
||||
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group.
|
||||
// Returns a similarity matrix where element (i, j) represents the Jaccard similarity
|
||||
// between set i and set j.
|
||||
//
|
||||
// The Jaccard similarity is: |A ∩ B| / |A ∪ B|
|
||||
//
|
||||
// The diagonal is 1.0 (similarity of a set to itself).
|
||||
//
|
||||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
||||
// otherwise they are set to "set_0", "set_1", etc.
|
||||
//
|
||||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
||||
// Space complexity: O(n²) for the similarity matrix
|
||||
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Create labels from set IDs
|
||||
labels := make([]string, n)
|
||||
for i, ks := range ksg.sets {
|
||||
if ks.Id() != "" {
|
||||
labels[i] = ks.Id()
|
||||
} else {
|
||||
labels[i] = fmt.Sprintf("set_%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
sm := obidist.NewSimilarityMatrixWithLabels(labels)
|
||||
|
||||
// Compute pairwise similarities
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j])
|
||||
sm.Set(i, j, similarity)
|
||||
}
|
||||
}
|
||||
|
||||
return sm
|
||||
}
|
||||
|
||||
231
pkg/obikmer/kmer_set_group_jaccard_test.go
Normal file
231
pkg/obikmer/kmer_set_group_jaccard_test.go
Normal file
@@ -0,0 +1,231 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Set 0: {1, 2, 3}
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
ksg.Get(0).AddKmerCode(3)
|
||||
ksg.Get(0).SetId("set_A")
|
||||
|
||||
// Set 1: {2, 3, 4}
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
ksg.Get(1).AddKmerCode(4)
|
||||
ksg.Get(1).SetId("set_B")
|
||||
|
||||
// Set 2: {5, 6, 7}
|
||||
ksg.Get(2).AddKmerCode(5)
|
||||
ksg.Get(2).AddKmerCode(6)
|
||||
ksg.Get(2).AddKmerCode(7)
|
||||
ksg.Get(2).SetId("set_C")
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
// Check labels
|
||||
if dm.GetLabel(0) != "set_A" {
|
||||
t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0))
|
||||
}
|
||||
if dm.GetLabel(1) != "set_B" {
|
||||
t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1))
|
||||
}
|
||||
if dm.GetLabel(2) != "set_C" {
|
||||
t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2))
|
||||
}
|
||||
|
||||
// Check distances
|
||||
// Distance(0, 1):
|
||||
// Intersection: {2, 3} -> 2 elements
|
||||
// Union: {1, 2, 3, 4} -> 4 elements
|
||||
// Similarity: 2/4 = 0.5
|
||||
// Distance: 1 - 0.5 = 0.5
|
||||
expectedDist01 := 0.5
|
||||
actualDist01 := dm.Get(0, 1)
|
||||
if math.Abs(actualDist01-expectedDist01) > 1e-10 {
|
||||
t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01)
|
||||
}
|
||||
|
||||
// Distance(0, 2):
|
||||
// Intersection: {} -> 0 elements
|
||||
// Union: {1, 2, 3, 5, 6, 7} -> 6 elements
|
||||
// Similarity: 0/6 = 0
|
||||
// Distance: 1 - 0 = 1.0
|
||||
expectedDist02 := 1.0
|
||||
actualDist02 := dm.Get(0, 2)
|
||||
if math.Abs(actualDist02-expectedDist02) > 1e-10 {
|
||||
t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02)
|
||||
}
|
||||
|
||||
// Distance(1, 2):
|
||||
// Intersection: {} -> 0 elements
|
||||
// Union: {2, 3, 4, 5, 6, 7} -> 6 elements
|
||||
// Similarity: 0/6 = 0
|
||||
// Distance: 1 - 0 = 1.0
|
||||
expectedDist12 := 1.0
|
||||
actualDist12 := dm.Get(1, 2)
|
||||
if math.Abs(actualDist12-expectedDist12) > 1e-10 {
|
||||
t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12)
|
||||
}
|
||||
|
||||
// Check symmetry
|
||||
if dm.Get(0, 1) != dm.Get(1, 0) {
|
||||
t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f",
|
||||
dm.Get(0, 1), dm.Get(1, 0))
|
||||
}
|
||||
|
||||
// Check diagonal
|
||||
if dm.Get(0, 0) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0))
|
||||
}
|
||||
if dm.Get(1, 1) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1))
|
||||
}
|
||||
if dm.Get(2, 2) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Set 0: {1, 2, 3}
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
ksg.Get(0).AddKmerCode(3)
|
||||
|
||||
// Set 1: {2, 3, 4}
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
ksg.Get(1).AddKmerCode(4)
|
||||
|
||||
// Set 2: {1, 2, 3} (same as set 0)
|
||||
ksg.Get(2).AddKmerCode(1)
|
||||
ksg.Get(2).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
|
||||
// Check similarities
|
||||
// Similarity(0, 1): 0.5 (as calculated above)
|
||||
expectedSim01 := 0.5
|
||||
actualSim01 := sm.Get(0, 1)
|
||||
if math.Abs(actualSim01-expectedSim01) > 1e-10 {
|
||||
t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01)
|
||||
}
|
||||
|
||||
// Similarity(0, 2): 1.0 (identical sets)
|
||||
expectedSim02 := 1.0
|
||||
actualSim02 := sm.Get(0, 2)
|
||||
if math.Abs(actualSim02-expectedSim02) > 1e-10 {
|
||||
t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02)
|
||||
}
|
||||
|
||||
// Similarity(1, 2): 0.5
|
||||
// Intersection: {2, 3} -> 2
|
||||
// Union: {1, 2, 3, 4} -> 4
|
||||
// Similarity: 2/4 = 0.5
|
||||
expectedSim12 := 0.5
|
||||
actualSim12 := sm.Get(1, 2)
|
||||
if math.Abs(actualSim12-expectedSim12) > 1e-10 {
|
||||
t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12)
|
||||
}
|
||||
|
||||
// Check diagonal (similarity to self = 1.0)
|
||||
if sm.Get(0, 0) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0))
|
||||
}
|
||||
if sm.Get(1, 1) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1))
|
||||
}
|
||||
if sm.Get(2, 2) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 4)
|
||||
|
||||
// Create different sets
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
|
||||
ksg.Get(2).AddKmerCode(1)
|
||||
ksg.Get(2).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
ksg.Get(3).AddKmerCode(10)
|
||||
ksg.Get(3).AddKmerCode(20)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
|
||||
// For all pairs (including diagonal), distance + similarity should equal 1.0
|
||||
for i := 0; i < 4; i++ {
|
||||
for j := 0; j < 4; j++ {
|
||||
distance := dm.Get(i, j)
|
||||
similarity := sm.Get(i, j)
|
||||
sum := distance + similarity
|
||||
|
||||
if math.Abs(sum-1.0) > 1e-10 {
|
||||
t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0",
|
||||
i, j, distance, similarity, sum)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Don't set IDs - should use default labels
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
// Check default labels
|
||||
if dm.GetLabel(0) != "set_0" {
|
||||
t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0))
|
||||
}
|
||||
if dm.GetLabel(1) != "set_1" {
|
||||
t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1))
|
||||
}
|
||||
if dm.GetLabel(2) != "set_2" {
|
||||
t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatrixSize(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 5)
|
||||
|
||||
for i := 0; i < 5; i++ {
|
||||
ksg.Get(i).AddKmerCode(uint64(i))
|
||||
}
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
if dm.Size() != 5 {
|
||||
t.Errorf("Expected matrix size 5, got %d", dm.Size())
|
||||
}
|
||||
|
||||
// All sets are disjoint, so all distances should be 1.0
|
||||
for i := 0; i < 5; i++ {
|
||||
for j := i + 1; j < 5; j++ {
|
||||
dist := dm.Get(i, j)
|
||||
if math.Abs(dist-1.0) > 1e-10 {
|
||||
t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f",
|
||||
i, j, dist)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
272
pkg/obikmer/kmer_set_test.go
Normal file
272
pkg/obikmer/kmer_set_test.go
Normal file
@@ -0,0 +1,272 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestJaccardDistanceIdentical(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(100)
|
||||
ks1.AddKmerCode(200)
|
||||
ks1.AddKmerCode(300)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(100)
|
||||
ks2.AddKmerCode(200)
|
||||
ks2.AddKmerCode(300)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
if distance != 0.0 {
|
||||
t.Errorf("Expected distance 0.0 for identical sets, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 1.0 {
|
||||
t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceDisjoint(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(100)
|
||||
ks1.AddKmerCode(200)
|
||||
ks1.AddKmerCode(300)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(400)
|
||||
ks2.AddKmerCode(500)
|
||||
ks2.AddKmerCode(600)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistancePartialOverlap(t *testing.T) {
|
||||
// Set 1: {1, 2, 3}
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
ks1.AddKmerCode(3)
|
||||
|
||||
// Set 2: {2, 3, 4}
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(2)
|
||||
ks2.AddKmerCode(3)
|
||||
ks2.AddKmerCode(4)
|
||||
|
||||
// Intersection: {2, 3} -> cardinality = 2
|
||||
// Union: {1, 2, 3, 4} -> cardinality = 4
|
||||
// Similarity = 2/4 = 0.5
|
||||
// Distance = 1 - 0.5 = 0.5
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
expectedDistance := 0.5
|
||||
expectedSimilarity := 0.5
|
||||
|
||||
if math.Abs(distance-expectedDistance) > 1e-10 {
|
||||
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
|
||||
}
|
||||
|
||||
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
|
||||
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceOneSubsetOfOther(t *testing.T) {
|
||||
// Set 1: {1, 2}
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
|
||||
// Set 2: {1, 2, 3, 4}
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(1)
|
||||
ks2.AddKmerCode(2)
|
||||
ks2.AddKmerCode(3)
|
||||
ks2.AddKmerCode(4)
|
||||
|
||||
// Intersection: {1, 2} -> cardinality = 2
|
||||
// Union: {1, 2, 3, 4} -> cardinality = 4
|
||||
// Similarity = 2/4 = 0.5
|
||||
// Distance = 1 - 0.5 = 0.5
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
expectedDistance := 0.5
|
||||
expectedSimilarity := 0.5
|
||||
|
||||
if math.Abs(distance-expectedDistance) > 1e-10 {
|
||||
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
|
||||
}
|
||||
|
||||
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
|
||||
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceEmptySets(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks2 := NewKmerSet(5)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
// By convention, distance = 1.0 for empty sets
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 for empty sets, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceOneEmpty(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
ks1.AddKmerCode(3)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
// Intersection: {} -> cardinality = 0
|
||||
// Union: {1, 2, 3} -> cardinality = 3
|
||||
// Similarity = 0/3 = 0.0
|
||||
// Distance = 1.0
|
||||
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceDifferentK(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
|
||||
ks2 := NewKmerSet(7)
|
||||
ks2.AddKmerCode(1)
|
||||
|
||||
defer func() {
|
||||
if r := recover(); r == nil {
|
||||
t.Errorf("Expected panic when computing Jaccard distance with different k values")
|
||||
}
|
||||
}()
|
||||
|
||||
_ = ks1.JaccardDistance(ks2)
|
||||
}
|
||||
|
||||
func TestJaccardDistanceSimilarityRelation(t *testing.T) {
|
||||
// Test that distance + similarity = 1.0 for all cases
|
||||
testCases := []struct {
|
||||
name string
|
||||
ks1 *KmerSet
|
||||
ks2 *KmerSet
|
||||
}{
|
||||
{
|
||||
name: "partial overlap",
|
||||
ks1: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(1)
|
||||
ks.AddKmerCode(2)
|
||||
ks.AddKmerCode(3)
|
||||
return ks
|
||||
}(),
|
||||
ks2: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(2)
|
||||
ks.AddKmerCode(3)
|
||||
ks.AddKmerCode(4)
|
||||
ks.AddKmerCode(5)
|
||||
return ks
|
||||
}(),
|
||||
},
|
||||
{
|
||||
name: "identical",
|
||||
ks1: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(10)
|
||||
ks.AddKmerCode(20)
|
||||
return ks
|
||||
}(),
|
||||
ks2: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(10)
|
||||
ks.AddKmerCode(20)
|
||||
return ks
|
||||
}(),
|
||||
},
|
||||
{
|
||||
name: "disjoint",
|
||||
ks1: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(1)
|
||||
return ks
|
||||
}(),
|
||||
ks2: func() *KmerSet {
|
||||
ks := NewKmerSet(5)
|
||||
ks.AddKmerCode(100)
|
||||
return ks
|
||||
}(),
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
distance := tc.ks1.JaccardDistance(tc.ks2)
|
||||
similarity := tc.ks1.JaccardSimilarity(tc.ks2)
|
||||
|
||||
sum := distance + similarity
|
||||
|
||||
if math.Abs(sum-1.0) > 1e-10 {
|
||||
t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f",
|
||||
distance, similarity, sum)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceSymmetry(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
ks1.AddKmerCode(3)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(2)
|
||||
ks2.AddKmerCode(3)
|
||||
ks2.AddKmerCode(4)
|
||||
|
||||
distance1 := ks1.JaccardDistance(ks2)
|
||||
distance2 := ks2.JaccardDistance(ks1)
|
||||
|
||||
similarity1 := ks1.JaccardSimilarity(ks2)
|
||||
similarity2 := ks2.JaccardSimilarity(ks1)
|
||||
|
||||
if math.Abs(distance1-distance2) > 1e-10 {
|
||||
t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2)
|
||||
}
|
||||
|
||||
if math.Abs(similarity1-similarity2) > 1e-10 {
|
||||
t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2)
|
||||
}
|
||||
}
|
||||
@@ -1,20 +1,14 @@
|
||||
package obioptions
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
// Version is automatically updated by the Makefile from version.txt
|
||||
// The patch number (third digit) is incremented on each push to the repository
|
||||
|
||||
// TODO: The version number is extracted from git. This induces that the version
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// commited
|
||||
|
||||
var _Commit = "a43e625"
|
||||
var _Version = "Release 4.4.0"
|
||||
var _Version = "Release 4.4.3"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns a string representing the version of the obitools package.
|
||||
func VersionString() string {
|
||||
return fmt.Sprintf("%s (%s)", _Version, _Commit)
|
||||
return _Version
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user