Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup.

For KmerSet:
- Added JaccardDistance method to compute the Jaccard distance between two KmerSets
- Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets

For KmerSetGroup:
- Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix
- Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix

Also includes:
- New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices
- Updated version handling with bump-version target in Makefile
- Added tests for all new methods
This commit is contained in:
Eric Coissac
2026-02-05 17:38:47 +01:00
parent aa2e94dd6f
commit e3c41fc11b
9 changed files with 1328 additions and 28 deletions

View File

@@ -62,10 +62,10 @@ OUTPUT:=$(shell mktemp)
all: install-githook obitools
obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
install-githook: $(GITHOOKS)
$(GITHOOK_DIR)/%: $(GITHOOK_SRC_DIR)/%
@echo installing $$(basename $@)...
@mkdir -p $(GITHOOK_DIR)
@@ -79,10 +79,10 @@ update-deps:
test: .FORCE
$(GOTEST) ./...
obitests:
obitests:
@for t in $$(find obitests -name test.sh -print) ; do \
bash $${t} || exit 1;\
done
done
githubtests: obitools obitests
@@ -94,19 +94,30 @@ $(foreach P,$(PACKAGE_DIRS),$(eval $(call MAKE_PKG_RULE,$(P))))
$(foreach P,$(OBITOOLS_DIRS),$(eval $(call MAKE_OBITOOLS_RULE,$(P))))
pkg/obioptions/version.go: .FORCE
ifneq ($(strip $(COMMIT_ID)),)
@cat $@ \
| sed -E 's/^var _Commit = "[^"]*"/var _Commit = "'$(COMMIT_ID)'"/' \
| sed -E 's/^var _Version = "[^"]*"/var _Version = "'"$(LAST_TAG)"'"/' \
pkg/obioptions/version.go: version.txt .FORCE
@version=$$(cat version.txt); \
cat $@ \
| sed -E 's/^var _Version = "[^"]*"/var _Version = "Release '$$version'"/' \
> $(OUTPUT)
@diff $@ $(OUTPUT) 2>&1 > /dev/null \
|| echo "Update version.go : $@ to $(LAST_TAG) ($(COMMIT_ID))" \
&& mv $(OUTPUT) $@
|| (echo "Update version.go to $$(cat version.txt)" && mv $(OUTPUT) $@)
@rm -f $(OUTPUT)
endif
bump-version:
@echo "Incrementing version..."
@current=$$(cat version.txt); \
echo " Current version: $$current"; \
major=$$(echo $$current | cut -d. -f1); \
minor=$$(echo $$current | cut -d. -f2); \
patch=$$(echo $$current | cut -d. -f3); \
new_patch=$$((patch + 1)); \
new_version="$$major.$$minor.$$new_patch"; \
echo " New version: $$new_version"; \
echo "$$new_version" > version.txt
@echo "✓ Version updated in version.txt"
@$(MAKE) pkg/obioptions/version.go
jjnew:
@echo "$(YELLOW)→ Creating a new commit...$(NC)"
@@ -116,13 +127,21 @@ jjnew:
@jj new
@echo "$(GREEN)✓ New commit created$(NC)"
jjpush:
jjpush:
@echo "$(YELLOW)→ Pushing commit to repository...$(NC)"
@echo "$(BLUE)→ Documenting current commit...$(NC)"
@jj auto-describe
@echo "$(BLUE)Done.$(NC)"
@jj git push --change @
@echo "$(GREEN)✓ Commit pushed to repository$(NC)"
@echo "$(BLUE)Creating new commit for version bump...$(NC)"
@jj new
@$(MAKE) bump-version
@echo "$(BLUE)→ Documenting version bump commit...$(NC)"
@jj auto-describe
@version=$$(cat version.txt); \
echo "$(BLUE)→ Pushing commits and creating tag v$$version...$(NC)"; \
jj git push --change @; \
git tag -a "v$$version" -m "Release $$version" 2>/dev/null || echo "Tag v$$version already exists"; \
git push origin "v$$version" 2>/dev/null || echo "Tag already pushed"
@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
jjfetch:
@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
@@ -130,5 +149,5 @@ jjfetch:
@jj new master@origin
@echo "$(GREEN)✓ Latest commits pulled$(NC)"
.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch .FORCE
.FORCE:
.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE
.FORCE:

272
pkg/obidist/dist_matrix.go Normal file
View File

@@ -0,0 +1,272 @@
package obidist
import (
"fmt"
)
// DistMatrix represents a symmetric matrix stored as a triangular matrix.
// The diagonal has a constant value (typically 0 for distances, 1 for similarities).
// Only the upper triangle (i < j) is stored to save memory.
//
// For an n×n matrix, we store n(n-1)/2 values.
type DistMatrix struct {
n int // Number of elements (matrix dimension)
data []float64 // Triangular storage: upper triangle only
labels []string // Optional labels for rows/columns
diagonalValue float64 // Value on the diagonal
}
// NewDistMatrix creates a new distance matrix of size n×n.
// All distances are initialized to 0.0, diagonal is 0.0.
func NewDistMatrix(n int) *DistMatrix {
if n < 0 {
panic("matrix size must be non-negative")
}
// Number of elements in upper triangle: n(n-1)/2
size := n * (n - 1) / 2
return &DistMatrix{
n: n,
data: make([]float64, size),
labels: make([]string, n),
diagonalValue: 0.0,
}
}
// NewDistMatrixWithLabels creates a new distance matrix with labels.
// Diagonal is 0.0 by default.
func NewDistMatrixWithLabels(labels []string) *DistMatrix {
dm := NewDistMatrix(len(labels))
copy(dm.labels, labels)
return dm
}
// NewSimilarityMatrix creates a new similarity matrix of size n×n.
// All off-diagonal values are initialized to 0.0, diagonal is 1.0.
func NewSimilarityMatrix(n int) *DistMatrix {
if n < 0 {
panic("matrix size must be non-negative")
}
// Number of elements in upper triangle: n(n-1)/2
size := n * (n - 1) / 2
return &DistMatrix{
n: n,
data: make([]float64, size),
labels: make([]string, n),
diagonalValue: 1.0,
}
}
// NewSimilarityMatrixWithLabels creates a new similarity matrix with labels.
// Diagonal is 1.0.
func NewSimilarityMatrixWithLabels(labels []string) *DistMatrix {
dm := NewSimilarityMatrix(len(labels))
copy(dm.labels, labels)
return dm
}
// Size returns the dimension of the matrix (n for an n×n matrix).
func (dm *DistMatrix) Size() int {
return dm.n
}
// indexFor computes the index in the data array for element (i, j).
// Assumes i < j (upper triangle).
//
// The upper triangle is stored row by row:
// (0,1), (0,2), ..., (0,n-1), (1,2), (1,3), ..., (1,n-1), (2,3), ...
//
// For element (i, j) where i < j:
// index = i*(n-1) + j - 1 - i*(i+1)/2
//
// This can be simplified to:
// index = i*n - i*(i+1)/2 + j - i - 1
// = i*(n - (i+1)/2 - 1) + j - 1
// = i*(n - 1 - i/2 - 1/2) + j - 1
//
// But the clearest formula is:
// index = i*n - i*(i+3)/2 + j - 1
func (dm *DistMatrix) indexFor(i, j int) int {
if i >= j {
panic(fmt.Sprintf("indexFor expects i < j, got i=%d, j=%d", i, j))
}
// Formula: number of elements in previous rows + position in current row
// Previous rows (0 to i-1): sum from k=0 to i-1 of (n-1-k) = i*n - i*(i+1)/2
// Current row position: j - i - 1
return i*dm.n - i*(i+1)/2 + j - i - 1
}
// Get returns the value at position (i, j).
// The matrix is symmetric, so Get(i, j) == Get(j, i).
// The diagonal returns the diagonalValue (0.0 for distances, 1.0 for similarities).
func (dm *DistMatrix) Get(i, j int) float64 {
if i < 0 || i >= dm.n || j < 0 || j >= dm.n {
panic(fmt.Sprintf("indices out of bounds: i=%d, j=%d, n=%d", i, j, dm.n))
}
// Diagonal: return the diagonal value
if i == j {
return dm.diagonalValue
}
// Ensure i < j for indexing
if i > j {
i, j = j, i
}
return dm.data[dm.indexFor(i, j)]
}
// Set sets the value at position (i, j).
// The matrix is symmetric, so Set(i, j, v) also sets (j, i) to v.
// Setting the diagonal (i == j) is ignored (diagonal has a fixed value).
func (dm *DistMatrix) Set(i, j int, value float64) {
if i < 0 || i >= dm.n || j < 0 || j >= dm.n {
panic(fmt.Sprintf("indices out of bounds: i=%d, j=%d, n=%d", i, j, dm.n))
}
// Ignore diagonal assignments (diagonal has a fixed value)
if i == j {
return
}
// Ensure i < j for indexing
if i > j {
i, j = j, i
}
dm.data[dm.indexFor(i, j)] = value
}
// GetLabel returns the label for element i.
func (dm *DistMatrix) GetLabel(i int) string {
if i < 0 || i >= dm.n {
panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n))
}
return dm.labels[i]
}
// SetLabel sets the label for element i.
func (dm *DistMatrix) SetLabel(i int, label string) {
if i < 0 || i >= dm.n {
panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n))
}
dm.labels[i] = label
}
// Labels returns a copy of all labels.
func (dm *DistMatrix) Labels() []string {
labels := make([]string, dm.n)
copy(labels, dm.labels)
return labels
}
// GetRow returns the i-th row of the distance matrix.
// The returned slice is a copy.
func (dm *DistMatrix) GetRow(i int) []float64 {
if i < 0 || i >= dm.n {
panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n))
}
row := make([]float64, dm.n)
for j := 0; j < dm.n; j++ {
row[j] = dm.Get(i, j)
}
return row
}
// GetColumn returns the j-th column of the distance matrix.
// Since the matrix is symmetric, GetColumn(j) == GetRow(j).
// The returned slice is a copy.
func (dm *DistMatrix) GetColumn(j int) []float64 {
return dm.GetRow(j)
}
// MinDistance returns the minimum non-zero distance in the matrix,
// along with the indices (i, j) where it occurs.
// Returns (0.0, -1, -1) if the matrix is empty or all distances are 0.
func (dm *DistMatrix) MinDistance() (float64, int, int) {
if dm.n <= 1 {
return 0.0, -1, -1
}
minDist := -1.0
minI, minJ := -1, -1
for i := 0; i < dm.n-1; i++ {
for j := i + 1; j < dm.n; j++ {
dist := dm.Get(i, j)
if minDist < 0 || dist < minDist {
minDist = dist
minI = i
minJ = j
}
}
}
if minI < 0 {
return 0.0, -1, -1
}
return minDist, minI, minJ
}
// MaxDistance returns the maximum distance in the matrix,
// along with the indices (i, j) where it occurs.
// Returns (0.0, -1, -1) if the matrix is empty.
func (dm *DistMatrix) MaxDistance() (float64, int, int) {
if dm.n <= 1 {
return 0.0, -1, -1
}
maxDist := -1.0
maxI, maxJ := -1, -1
for i := 0; i < dm.n-1; i++ {
for j := i + 1; j < dm.n; j++ {
dist := dm.Get(i, j)
if maxDist < 0 || dist > maxDist {
maxDist = dist
maxI = i
maxJ = j
}
}
}
if maxI < 0 {
return 0.0, -1, -1
}
return maxDist, maxI, maxJ
}
// Copy creates a deep copy of the matrix.
func (dm *DistMatrix) Copy() *DistMatrix {
newDM := &DistMatrix{
n: dm.n,
data: make([]float64, len(dm.data)),
labels: make([]string, dm.n),
diagonalValue: dm.diagonalValue,
}
copy(newDM.data, dm.data)
copy(newDM.labels, dm.labels)
return newDM
}
// ToFullMatrix returns a full n×n matrix representation.
// This allocates n² values, so use only when needed.
func (dm *DistMatrix) ToFullMatrix() [][]float64 {
matrix := make([][]float64, dm.n)
for i := 0; i < dm.n; i++ {
matrix[i] = make([]float64, dm.n)
for j := 0; j < dm.n; j++ {
matrix[i][j] = dm.Get(i, j)
}
}
return matrix
}

View File

@@ -0,0 +1,386 @@
package obidist
import (
"math"
"testing"
)
func TestNewDistMatrix(t *testing.T) {
dm := NewDistMatrix(5)
if dm.Size() != 5 {
t.Errorf("Expected size 5, got %d", dm.Size())
}
// Check that all values are initialized to 0
for i := 0; i < 5; i++ {
for j := 0; j < 5; j++ {
if dm.Get(i, j) != 0.0 {
t.Errorf("Expected 0.0 at (%d, %d), got %f", i, j, dm.Get(i, j))
}
}
}
}
func TestDistMatrixDiagonal(t *testing.T) {
dm := NewDistMatrix(5)
// Diagonal should always be 0
for i := 0; i < 5; i++ {
if dm.Get(i, i) != 0.0 {
t.Errorf("Expected diagonal element (%d, %d) to be 0.0, got %f", i, i, dm.Get(i, i))
}
}
// Try to set diagonal (should be ignored)
dm.Set(2, 2, 5.0)
if dm.Get(2, 2) != 0.0 {
t.Errorf("Diagonal should remain 0.0 even after Set, got %f", dm.Get(2, 2))
}
}
func TestDistMatrixSymmetry(t *testing.T) {
dm := NewDistMatrix(4)
dm.Set(0, 1, 1.5)
dm.Set(0, 2, 2.5)
dm.Set(1, 3, 3.5)
// Check symmetry
if dm.Get(0, 1) != dm.Get(1, 0) {
t.Errorf("Matrix not symmetric: Get(0,1)=%f, Get(1,0)=%f", dm.Get(0, 1), dm.Get(1, 0))
}
if dm.Get(0, 2) != dm.Get(2, 0) {
t.Errorf("Matrix not symmetric: Get(0,2)=%f, Get(2,0)=%f", dm.Get(0, 2), dm.Get(2, 0))
}
if dm.Get(1, 3) != dm.Get(3, 1) {
t.Errorf("Matrix not symmetric: Get(1,3)=%f, Get(3,1)=%f", dm.Get(1, 3), dm.Get(3, 1))
}
}
func TestDistMatrixSetGet(t *testing.T) {
dm := NewDistMatrix(4)
testCases := []struct {
i int
j int
value float64
}{
{0, 1, 1.5},
{0, 2, 2.5},
{0, 3, 3.5},
{1, 2, 4.5},
{1, 3, 5.5},
{2, 3, 6.5},
}
for _, tc := range testCases {
dm.Set(tc.i, tc.j, tc.value)
}
for _, tc := range testCases {
got := dm.Get(tc.i, tc.j)
if math.Abs(got-tc.value) > 1e-10 {
t.Errorf("Get(%d, %d): expected %f, got %f", tc.i, tc.j, tc.value, got)
}
// Check symmetry
got = dm.Get(tc.j, tc.i)
if math.Abs(got-tc.value) > 1e-10 {
t.Errorf("Get(%d, %d) (symmetric): expected %f, got %f", tc.j, tc.i, tc.value, got)
}
}
}
func TestDistMatrixLabels(t *testing.T) {
labels := []string{"A", "B", "C", "D"}
dm := NewDistMatrixWithLabels(labels)
if dm.Size() != 4 {
t.Errorf("Expected size 4, got %d", dm.Size())
}
for i, label := range labels {
if dm.GetLabel(i) != label {
t.Errorf("Expected label %s at index %d, got %s", label, i, dm.GetLabel(i))
}
}
// Modify a label
dm.SetLabel(1, "Modified")
if dm.GetLabel(1) != "Modified" {
t.Errorf("Expected label 'Modified' at index 1, got %s", dm.GetLabel(1))
}
// Check Labels() returns a copy
labelsCopy := dm.Labels()
labelsCopy[0] = "ChangedCopy"
if dm.GetLabel(0) != "A" {
t.Errorf("Modifying Labels() return value should not affect original labels")
}
}
func TestDistMatrixMinDistance(t *testing.T) {
dm := NewDistMatrix(4)
dm.Set(0, 1, 2.5)
dm.Set(0, 2, 1.5) // minimum
dm.Set(0, 3, 3.5)
dm.Set(1, 2, 4.5)
dm.Set(1, 3, 5.5)
dm.Set(2, 3, 6.5)
minDist, minI, minJ := dm.MinDistance()
if math.Abs(minDist-1.5) > 1e-10 {
t.Errorf("Expected min distance 1.5, got %f", minDist)
}
if (minI != 0 || minJ != 2) && (minI != 2 || minJ != 0) {
t.Errorf("Expected min at (0, 2) or (2, 0), got (%d, %d)", minI, minJ)
}
}
func TestDistMatrixMaxDistance(t *testing.T) {
dm := NewDistMatrix(4)
dm.Set(0, 1, 2.5)
dm.Set(0, 2, 1.5)
dm.Set(0, 3, 3.5)
dm.Set(1, 2, 4.5)
dm.Set(1, 3, 5.5)
dm.Set(2, 3, 6.5) // maximum
maxDist, maxI, maxJ := dm.MaxDistance()
if math.Abs(maxDist-6.5) > 1e-10 {
t.Errorf("Expected max distance 6.5, got %f", maxDist)
}
if (maxI != 2 || maxJ != 3) && (maxI != 3 || maxJ != 2) {
t.Errorf("Expected max at (2, 3) or (3, 2), got (%d, %d)", maxI, maxJ)
}
}
func TestDistMatrixGetRow(t *testing.T) {
dm := NewDistMatrix(3)
dm.Set(0, 1, 1.0)
dm.Set(0, 2, 2.0)
dm.Set(1, 2, 3.0)
row0 := dm.GetRow(0)
expected0 := []float64{0.0, 1.0, 2.0}
for i, val := range expected0 {
if math.Abs(row0[i]-val) > 1e-10 {
t.Errorf("Row 0[%d]: expected %f, got %f", i, val, row0[i])
}
}
row1 := dm.GetRow(1)
expected1 := []float64{1.0, 0.0, 3.0}
for i, val := range expected1 {
if math.Abs(row1[i]-val) > 1e-10 {
t.Errorf("Row 1[%d]: expected %f, got %f", i, val, row1[i])
}
}
}
func TestDistMatrixCopy(t *testing.T) {
dm := NewDistMatrixWithLabels([]string{"A", "B", "C"})
dm.Set(0, 1, 1.5)
dm.Set(0, 2, 2.5)
dm.Set(1, 2, 3.5)
dmCopy := dm.Copy()
// Check values are copied
if dmCopy.Get(0, 1) != dm.Get(0, 1) {
t.Errorf("Copy has different value at (0, 1)")
}
// Check labels are copied
if dmCopy.GetLabel(0) != dm.GetLabel(0) {
t.Errorf("Copy has different label at index 0")
}
// Modify copy and ensure original unchanged
dmCopy.Set(0, 1, 99.9)
if dm.Get(0, 1) == 99.9 {
t.Errorf("Modifying copy affected original matrix")
}
dmCopy.SetLabel(0, "Modified")
if dm.GetLabel(0) == "Modified" {
t.Errorf("Modifying copy label affected original matrix")
}
}
func TestDistMatrixToFullMatrix(t *testing.T) {
dm := NewDistMatrix(3)
dm.Set(0, 1, 1.0)
dm.Set(0, 2, 2.0)
dm.Set(1, 2, 3.0)
full := dm.ToFullMatrix()
expected := [][]float64{
{0.0, 1.0, 2.0},
{1.0, 0.0, 3.0},
{2.0, 3.0, 0.0},
}
for i := 0; i < 3; i++ {
for j := 0; j < 3; j++ {
if math.Abs(full[i][j]-expected[i][j]) > 1e-10 {
t.Errorf("Full matrix[%d][%d]: expected %f, got %f",
i, j, expected[i][j], full[i][j])
}
}
}
}
func TestDistMatrixBoundsChecking(t *testing.T) {
dm := NewDistMatrix(3)
// Test Get out of bounds
testPanic := func(f func()) {
defer func() {
if r := recover(); r == nil {
t.Errorf("Expected panic, but didn't get one")
}
}()
f()
}
testPanic(func() { dm.Get(-1, 0) })
testPanic(func() { dm.Get(0, 3) })
testPanic(func() { dm.Set(3, 0, 1.0) })
testPanic(func() { dm.GetLabel(-1) })
testPanic(func() { dm.SetLabel(3, "Invalid") })
testPanic(func() { dm.GetRow(3) })
}
func TestDistMatrixEmptyMatrix(t *testing.T) {
dm := NewDistMatrix(0)
if dm.Size() != 0 {
t.Errorf("Expected size 0, got %d", dm.Size())
}
minDist, minI, minJ := dm.MinDistance()
if minDist != 0.0 || minI != -1 || minJ != -1 {
t.Errorf("Empty matrix MinDistance should return (0.0, -1, -1), got (%f, %d, %d)",
minDist, minI, minJ)
}
maxDist, maxI, maxJ := dm.MaxDistance()
if maxDist != 0.0 || maxI != -1 || maxJ != -1 {
t.Errorf("Empty matrix MaxDistance should return (0.0, -1, -1), got (%f, %d, %d)",
maxDist, maxI, maxJ)
}
}
func TestDistMatrixSingleElement(t *testing.T) {
dm := NewDistMatrix(1)
if dm.Size() != 1 {
t.Errorf("Expected size 1, got %d", dm.Size())
}
// Only element is diagonal (always 0)
if dm.Get(0, 0) != 0.0 {
t.Errorf("Expected 0.0 at (0, 0), got %f", dm.Get(0, 0))
}
minDist, minI, minJ := dm.MinDistance()
if minDist != 0.0 || minI != -1 || minJ != -1 {
t.Errorf("Single element matrix MinDistance should return (0.0, -1, -1), got (%f, %d, %d)",
minDist, minI, minJ)
}
}
func TestNewSimilarityMatrix(t *testing.T) {
sm := NewSimilarityMatrix(4)
if sm.Size() != 4 {
t.Errorf("Expected size 4, got %d", sm.Size())
}
// Check diagonal is 1.0
for i := 0; i < 4; i++ {
if sm.Get(i, i) != 1.0 {
t.Errorf("Expected diagonal element (%d, %d) to be 1.0, got %f", i, i, sm.Get(i, i))
}
}
// Check off-diagonal is 0.0
if sm.Get(0, 1) != 0.0 {
t.Errorf("Expected off-diagonal to be 0.0, got %f", sm.Get(0, 1))
}
}
func TestNewSimilarityMatrixWithLabels(t *testing.T) {
labels := []string{"A", "B", "C"}
sm := NewSimilarityMatrixWithLabels(labels)
if sm.Size() != 3 {
t.Errorf("Expected size 3, got %d", sm.Size())
}
// Check labels
for i, label := range labels {
if sm.GetLabel(i) != label {
t.Errorf("Expected label %s at index %d, got %s", label, i, sm.GetLabel(i))
}
}
// Check diagonal is 1.0
for i := 0; i < 3; i++ {
if sm.Get(i, i) != 1.0 {
t.Errorf("Expected diagonal element (%d, %d) to be 1.0, got %f", i, i, sm.Get(i, i))
}
}
// Set some similarities
sm.Set(0, 1, 0.8)
sm.Set(0, 2, 0.6)
sm.Set(1, 2, 0.7)
// Check values
if math.Abs(sm.Get(0, 1)-0.8) > 1e-10 {
t.Errorf("Expected 0.8 at (0, 1), got %f", sm.Get(0, 1))
}
if math.Abs(sm.Get(1, 0)-0.8) > 1e-10 {
t.Errorf("Expected 0.8 at (1, 0) (symmetry), got %f", sm.Get(1, 0))
}
}
func TestSimilarityMatrixCopy(t *testing.T) {
sm := NewSimilarityMatrix(3)
sm.Set(0, 1, 0.9)
sm.Set(0, 2, 0.7)
smCopy := sm.Copy()
// Check diagonal is preserved
if smCopy.Get(0, 0) != 1.0 {
t.Errorf("Copied similarity matrix should have diagonal 1.0, got %f", smCopy.Get(0, 0))
}
// Check values are preserved
if math.Abs(smCopy.Get(0, 1)-0.9) > 1e-10 {
t.Errorf("Copy should preserve values, expected 0.9, got %f", smCopy.Get(0, 1))
}
// Modify copy and ensure original unchanged
smCopy.Set(0, 1, 0.5)
if math.Abs(sm.Get(0, 1)-0.9) > 1e-10 {
t.Errorf("Modifying copy should not affect original, expected 0.9, got %f", sm.Get(0, 1))
}
}

View File

@@ -158,6 +158,54 @@ func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
return NewKmerSetFromBitmap(ks.k, result)
}
// JaccardDistance computes the Jaccard distance between two KmerSets.
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A B|)
// where A and B are the two sets.
//
// Returns:
// - 0.0 when sets are identical (distance = 0, similarity = 1)
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
// - 1.0 when both sets are empty (by convention)
//
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
if ks.k != other.k {
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
}
// Compute intersection cardinality
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
// Compute union cardinality
unionCard := ks.bitmap.OrCardinality(other.bitmap)
// If union is empty, both sets are empty - return 1.0 by convention
if unionCard == 0 {
return 1.0
}
// Jaccard similarity = |A ∩ B| / |A B|
similarity := float64(intersectionCard) / float64(unionCard)
// Jaccard distance = 1 - similarity
return 1.0 - similarity
}
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
// The Jaccard similarity is defined as: |A ∩ B| / |A B|
//
// Returns:
// - 1.0 when sets are identical (maximum similarity)
// - 0.0 when sets are completely disjoint (no similarity)
// - 0.0 when both sets are empty (by convention)
//
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
return 1.0 - ks.JaccardDistance(other)
}
// Iterator returns an iterator over all k-mers in the set
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
return ks.bitmap.Iterator()

View File

@@ -3,6 +3,7 @@ package obikmer
import (
"fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
@@ -260,3 +261,79 @@ Set breakdown:
return result
}
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group.
// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance
// between set i and set j.
//
// The Jaccard distance is: 1 - (|A ∩ B| / |A B|)
//
// The matrix labels are set to the IDs of the individual KmerSets if available,
// otherwise they are set to "set_0", "set_1", etc.
//
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
// Space complexity: O(n²) for the distance matrix
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
n := len(ksg.sets)
// Create labels from set IDs
labels := make([]string, n)
for i, ks := range ksg.sets {
if ks.Id() != "" {
labels[i] = ks.Id()
} else {
labels[i] = fmt.Sprintf("set_%d", i)
}
}
dm := obidist.NewDistMatrixWithLabels(labels)
// Compute pairwise distances
for i := 0; i < n-1; i++ {
for j := i + 1; j < n; j++ {
distance := ksg.sets[i].JaccardDistance(ksg.sets[j])
dm.Set(i, j, distance)
}
}
return dm
}
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group.
// Returns a similarity matrix where element (i, j) represents the Jaccard similarity
// between set i and set j.
//
// The Jaccard similarity is: |A ∩ B| / |A B|
//
// The diagonal is 1.0 (similarity of a set to itself).
//
// The matrix labels are set to the IDs of the individual KmerSets if available,
// otherwise they are set to "set_0", "set_1", etc.
//
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
// Space complexity: O(n²) for the similarity matrix
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
n := len(ksg.sets)
// Create labels from set IDs
labels := make([]string, n)
for i, ks := range ksg.sets {
if ks.Id() != "" {
labels[i] = ks.Id()
} else {
labels[i] = fmt.Sprintf("set_%d", i)
}
}
sm := obidist.NewSimilarityMatrixWithLabels(labels)
// Compute pairwise similarities
for i := 0; i < n-1; i++ {
for j := i + 1; j < n; j++ {
similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j])
sm.Set(i, j, similarity)
}
}
return sm
}

View File

@@ -0,0 +1,231 @@
package obikmer
import (
"math"
"testing"
)
func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) {
ksg := NewKmerSetGroup(5, 3)
// Set 0: {1, 2, 3}
ksg.Get(0).AddKmerCode(1)
ksg.Get(0).AddKmerCode(2)
ksg.Get(0).AddKmerCode(3)
ksg.Get(0).SetId("set_A")
// Set 1: {2, 3, 4}
ksg.Get(1).AddKmerCode(2)
ksg.Get(1).AddKmerCode(3)
ksg.Get(1).AddKmerCode(4)
ksg.Get(1).SetId("set_B")
// Set 2: {5, 6, 7}
ksg.Get(2).AddKmerCode(5)
ksg.Get(2).AddKmerCode(6)
ksg.Get(2).AddKmerCode(7)
ksg.Get(2).SetId("set_C")
dm := ksg.JaccardDistanceMatrix()
// Check labels
if dm.GetLabel(0) != "set_A" {
t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0))
}
if dm.GetLabel(1) != "set_B" {
t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1))
}
if dm.GetLabel(2) != "set_C" {
t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2))
}
// Check distances
// Distance(0, 1):
// Intersection: {2, 3} -> 2 elements
// Union: {1, 2, 3, 4} -> 4 elements
// Similarity: 2/4 = 0.5
// Distance: 1 - 0.5 = 0.5
expectedDist01 := 0.5
actualDist01 := dm.Get(0, 1)
if math.Abs(actualDist01-expectedDist01) > 1e-10 {
t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01)
}
// Distance(0, 2):
// Intersection: {} -> 0 elements
// Union: {1, 2, 3, 5, 6, 7} -> 6 elements
// Similarity: 0/6 = 0
// Distance: 1 - 0 = 1.0
expectedDist02 := 1.0
actualDist02 := dm.Get(0, 2)
if math.Abs(actualDist02-expectedDist02) > 1e-10 {
t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02)
}
// Distance(1, 2):
// Intersection: {} -> 0 elements
// Union: {2, 3, 4, 5, 6, 7} -> 6 elements
// Similarity: 0/6 = 0
// Distance: 1 - 0 = 1.0
expectedDist12 := 1.0
actualDist12 := dm.Get(1, 2)
if math.Abs(actualDist12-expectedDist12) > 1e-10 {
t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12)
}
// Check symmetry
if dm.Get(0, 1) != dm.Get(1, 0) {
t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f",
dm.Get(0, 1), dm.Get(1, 0))
}
// Check diagonal
if dm.Get(0, 0) != 0.0 {
t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0))
}
if dm.Get(1, 1) != 0.0 {
t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1))
}
if dm.Get(2, 2) != 0.0 {
t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2))
}
}
func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) {
ksg := NewKmerSetGroup(5, 3)
// Set 0: {1, 2, 3}
ksg.Get(0).AddKmerCode(1)
ksg.Get(0).AddKmerCode(2)
ksg.Get(0).AddKmerCode(3)
// Set 1: {2, 3, 4}
ksg.Get(1).AddKmerCode(2)
ksg.Get(1).AddKmerCode(3)
ksg.Get(1).AddKmerCode(4)
// Set 2: {1, 2, 3} (same as set 0)
ksg.Get(2).AddKmerCode(1)
ksg.Get(2).AddKmerCode(2)
ksg.Get(2).AddKmerCode(3)
sm := ksg.JaccardSimilarityMatrix()
// Check similarities
// Similarity(0, 1): 0.5 (as calculated above)
expectedSim01 := 0.5
actualSim01 := sm.Get(0, 1)
if math.Abs(actualSim01-expectedSim01) > 1e-10 {
t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01)
}
// Similarity(0, 2): 1.0 (identical sets)
expectedSim02 := 1.0
actualSim02 := sm.Get(0, 2)
if math.Abs(actualSim02-expectedSim02) > 1e-10 {
t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02)
}
// Similarity(1, 2): 0.5
// Intersection: {2, 3} -> 2
// Union: {1, 2, 3, 4} -> 4
// Similarity: 2/4 = 0.5
expectedSim12 := 0.5
actualSim12 := sm.Get(1, 2)
if math.Abs(actualSim12-expectedSim12) > 1e-10 {
t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12)
}
// Check diagonal (similarity to self = 1.0)
if sm.Get(0, 0) != 1.0 {
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0))
}
if sm.Get(1, 1) != 1.0 {
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1))
}
if sm.Get(2, 2) != 1.0 {
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2))
}
}
func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) {
ksg := NewKmerSetGroup(5, 4)
// Create different sets
ksg.Get(0).AddKmerCode(1)
ksg.Get(0).AddKmerCode(2)
ksg.Get(1).AddKmerCode(2)
ksg.Get(1).AddKmerCode(3)
ksg.Get(2).AddKmerCode(1)
ksg.Get(2).AddKmerCode(2)
ksg.Get(2).AddKmerCode(3)
ksg.Get(3).AddKmerCode(10)
ksg.Get(3).AddKmerCode(20)
dm := ksg.JaccardDistanceMatrix()
sm := ksg.JaccardSimilarityMatrix()
// For all pairs (including diagonal), distance + similarity should equal 1.0
for i := 0; i < 4; i++ {
for j := 0; j < 4; j++ {
distance := dm.Get(i, j)
similarity := sm.Get(i, j)
sum := distance + similarity
if math.Abs(sum-1.0) > 1e-10 {
t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0",
i, j, distance, similarity, sum)
}
}
}
}
func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) {
ksg := NewKmerSetGroup(5, 3)
// Don't set IDs - should use default labels
ksg.Get(0).AddKmerCode(1)
ksg.Get(1).AddKmerCode(2)
ksg.Get(2).AddKmerCode(3)
dm := ksg.JaccardDistanceMatrix()
// Check default labels
if dm.GetLabel(0) != "set_0" {
t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0))
}
if dm.GetLabel(1) != "set_1" {
t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1))
}
if dm.GetLabel(2) != "set_2" {
t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2))
}
}
func TestKmerSetGroupJaccardMatrixSize(t *testing.T) {
ksg := NewKmerSetGroup(5, 5)
for i := 0; i < 5; i++ {
ksg.Get(i).AddKmerCode(uint64(i))
}
dm := ksg.JaccardDistanceMatrix()
if dm.Size() != 5 {
t.Errorf("Expected matrix size 5, got %d", dm.Size())
}
// All sets are disjoint, so all distances should be 1.0
for i := 0; i < 5; i++ {
for j := i + 1; j < 5; j++ {
dist := dm.Get(i, j)
if math.Abs(dist-1.0) > 1e-10 {
t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f",
i, j, dist)
}
}
}
}

View File

@@ -0,0 +1,272 @@
package obikmer
import (
"math"
"testing"
)
func TestJaccardDistanceIdentical(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(100)
ks1.AddKmerCode(200)
ks1.AddKmerCode(300)
ks2 := NewKmerSet(5)
ks2.AddKmerCode(100)
ks2.AddKmerCode(200)
ks2.AddKmerCode(300)
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
if distance != 0.0 {
t.Errorf("Expected distance 0.0 for identical sets, got %f", distance)
}
if similarity != 1.0 {
t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity)
}
}
func TestJaccardDistanceDisjoint(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(100)
ks1.AddKmerCode(200)
ks1.AddKmerCode(300)
ks2 := NewKmerSet(5)
ks2.AddKmerCode(400)
ks2.AddKmerCode(500)
ks2.AddKmerCode(600)
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
if distance != 1.0 {
t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance)
}
if similarity != 0.0 {
t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity)
}
}
func TestJaccardDistancePartialOverlap(t *testing.T) {
// Set 1: {1, 2, 3}
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks1.AddKmerCode(2)
ks1.AddKmerCode(3)
// Set 2: {2, 3, 4}
ks2 := NewKmerSet(5)
ks2.AddKmerCode(2)
ks2.AddKmerCode(3)
ks2.AddKmerCode(4)
// Intersection: {2, 3} -> cardinality = 2
// Union: {1, 2, 3, 4} -> cardinality = 4
// Similarity = 2/4 = 0.5
// Distance = 1 - 0.5 = 0.5
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
expectedDistance := 0.5
expectedSimilarity := 0.5
if math.Abs(distance-expectedDistance) > 1e-10 {
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
}
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
}
}
func TestJaccardDistanceOneSubsetOfOther(t *testing.T) {
// Set 1: {1, 2}
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks1.AddKmerCode(2)
// Set 2: {1, 2, 3, 4}
ks2 := NewKmerSet(5)
ks2.AddKmerCode(1)
ks2.AddKmerCode(2)
ks2.AddKmerCode(3)
ks2.AddKmerCode(4)
// Intersection: {1, 2} -> cardinality = 2
// Union: {1, 2, 3, 4} -> cardinality = 4
// Similarity = 2/4 = 0.5
// Distance = 1 - 0.5 = 0.5
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
expectedDistance := 0.5
expectedSimilarity := 0.5
if math.Abs(distance-expectedDistance) > 1e-10 {
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
}
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
}
}
func TestJaccardDistanceEmptySets(t *testing.T) {
ks1 := NewKmerSet(5)
ks2 := NewKmerSet(5)
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
// By convention, distance = 1.0 for empty sets
if distance != 1.0 {
t.Errorf("Expected distance 1.0 for empty sets, got %f", distance)
}
if similarity != 0.0 {
t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity)
}
}
func TestJaccardDistanceOneEmpty(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks1.AddKmerCode(2)
ks1.AddKmerCode(3)
ks2 := NewKmerSet(5)
distance := ks1.JaccardDistance(ks2)
similarity := ks1.JaccardSimilarity(ks2)
// Intersection: {} -> cardinality = 0
// Union: {1, 2, 3} -> cardinality = 3
// Similarity = 0/3 = 0.0
// Distance = 1.0
if distance != 1.0 {
t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance)
}
if similarity != 0.0 {
t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity)
}
}
func TestJaccardDistanceDifferentK(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks2 := NewKmerSet(7)
ks2.AddKmerCode(1)
defer func() {
if r := recover(); r == nil {
t.Errorf("Expected panic when computing Jaccard distance with different k values")
}
}()
_ = ks1.JaccardDistance(ks2)
}
func TestJaccardDistanceSimilarityRelation(t *testing.T) {
// Test that distance + similarity = 1.0 for all cases
testCases := []struct {
name string
ks1 *KmerSet
ks2 *KmerSet
}{
{
name: "partial overlap",
ks1: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(1)
ks.AddKmerCode(2)
ks.AddKmerCode(3)
return ks
}(),
ks2: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(2)
ks.AddKmerCode(3)
ks.AddKmerCode(4)
ks.AddKmerCode(5)
return ks
}(),
},
{
name: "identical",
ks1: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(10)
ks.AddKmerCode(20)
return ks
}(),
ks2: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(10)
ks.AddKmerCode(20)
return ks
}(),
},
{
name: "disjoint",
ks1: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(1)
return ks
}(),
ks2: func() *KmerSet {
ks := NewKmerSet(5)
ks.AddKmerCode(100)
return ks
}(),
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
distance := tc.ks1.JaccardDistance(tc.ks2)
similarity := tc.ks1.JaccardSimilarity(tc.ks2)
sum := distance + similarity
if math.Abs(sum-1.0) > 1e-10 {
t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f",
distance, similarity, sum)
}
})
}
}
func TestJaccardDistanceSymmetry(t *testing.T) {
ks1 := NewKmerSet(5)
ks1.AddKmerCode(1)
ks1.AddKmerCode(2)
ks1.AddKmerCode(3)
ks2 := NewKmerSet(5)
ks2.AddKmerCode(2)
ks2.AddKmerCode(3)
ks2.AddKmerCode(4)
distance1 := ks1.JaccardDistance(ks2)
distance2 := ks2.JaccardDistance(ks1)
similarity1 := ks1.JaccardSimilarity(ks2)
similarity2 := ks2.JaccardSimilarity(ks1)
if math.Abs(distance1-distance2) > 1e-10 {
t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2)
}
if math.Abs(similarity1-similarity2) > 1e-10 {
t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2)
}
}

View File

@@ -1,20 +1,14 @@
package obioptions
import (
"fmt"
)
// Version is automatically updated by the Makefile from version.txt
// The patch number (third digit) is incremented on each push to the repository
// TODO: The version number is extracted from git. This induces that the version
// corresponds to the last commit, and not the one when the file will be
// commited
var _Commit = "a43e625"
var _Version = "Release 4.4.0"
var _Version = "Release 4.4.3"
// Version returns the version of the obitools package.
//
// No parameters.
// Returns a string representing the version of the obitools package.
func VersionString() string {
return fmt.Sprintf("%s (%s)", _Version, _Commit)
return _Version
}

1
version.txt Normal file
View File

@@ -0,0 +1 @@
4.4.3