diff --git a/Makefile b/Makefile index a3eeba0..ee8ec71 100644 --- a/Makefile +++ b/Makefile @@ -62,10 +62,10 @@ OUTPUT:=$(shell mktemp) all: install-githook obitools -obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS)) +obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS)) install-githook: $(GITHOOKS) - + $(GITHOOK_DIR)/%: $(GITHOOK_SRC_DIR)/% @echo installing $$(basename $@)... @mkdir -p $(GITHOOK_DIR) @@ -79,10 +79,10 @@ update-deps: test: .FORCE $(GOTEST) ./... -obitests: +obitests: @for t in $$(find obitests -name test.sh -print) ; do \ bash $${t} || exit 1;\ - done + done githubtests: obitools obitests @@ -94,19 +94,30 @@ $(foreach P,$(PACKAGE_DIRS),$(eval $(call MAKE_PKG_RULE,$(P)))) $(foreach P,$(OBITOOLS_DIRS),$(eval $(call MAKE_OBITOOLS_RULE,$(P)))) -pkg/obioptions/version.go: .FORCE -ifneq ($(strip $(COMMIT_ID)),) - @cat $@ \ - | sed -E 's/^var _Commit = "[^"]*"/var _Commit = "'$(COMMIT_ID)'"/' \ - | sed -E 's/^var _Version = "[^"]*"/var _Version = "'"$(LAST_TAG)"'"/' \ +pkg/obioptions/version.go: version.txt .FORCE + @version=$$(cat version.txt); \ + cat $@ \ + | sed -E 's/^var _Version = "[^"]*"/var _Version = "Release '$$version'"/' \ > $(OUTPUT) @diff $@ $(OUTPUT) 2>&1 > /dev/null \ - || echo "Update version.go : $@ to $(LAST_TAG) ($(COMMIT_ID))" \ - && mv $(OUTPUT) $@ + || (echo "Update version.go to $$(cat version.txt)" && mv $(OUTPUT) $@) @rm -f $(OUTPUT) -endif + +bump-version: + @echo "Incrementing version..." + @current=$$(cat version.txt); \ + echo " Current version: $$current"; \ + major=$$(echo $$current | cut -d. -f1); \ + minor=$$(echo $$current | cut -d. -f2); \ + patch=$$(echo $$current | cut -d. -f3); \ + new_patch=$$((patch + 1)); \ + new_version="$$major.$$minor.$$new_patch"; \ + echo " New version: $$new_version"; \ + echo "$$new_version" > version.txt + @echo "✓ Version updated in version.txt" + @$(MAKE) pkg/obioptions/version.go jjnew: @echo "$(YELLOW)→ Creating a new commit...$(NC)" @@ -116,13 +127,21 @@ jjnew: @jj new @echo "$(GREEN)✓ New commit created$(NC)" -jjpush: +jjpush: @echo "$(YELLOW)→ Pushing commit to repository...$(NC)" @echo "$(BLUE)→ Documenting current commit...$(NC)" @jj auto-describe - @echo "$(BLUE)→ Done.$(NC)" - @jj git push --change @ - @echo "$(GREEN)✓ Commit pushed to repository$(NC)" + @echo "$(BLUE)→ Creating new commit for version bump...$(NC)" + @jj new + @$(MAKE) bump-version + @echo "$(BLUE)→ Documenting version bump commit...$(NC)" + @jj auto-describe + @version=$$(cat version.txt); \ + echo "$(BLUE)→ Pushing commits and creating tag v$$version...$(NC)"; \ + jj git push --change @; \ + git tag -a "v$$version" -m "Release $$version" 2>/dev/null || echo "Tag v$$version already exists"; \ + git push origin "v$$version" 2>/dev/null || echo "Tag already pushed" + @echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)" jjfetch: @echo "$(YELLOW)→ Pulling latest commits...$(NC)" @@ -130,5 +149,5 @@ jjfetch: @jj new master@origin @echo "$(GREEN)✓ Latest commits pulled$(NC)" -.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch .FORCE -.FORCE: \ No newline at end of file +.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE +.FORCE: diff --git a/pkg/obidist/dist_matrix.go b/pkg/obidist/dist_matrix.go new file mode 100644 index 0000000..de8a34b --- /dev/null +++ b/pkg/obidist/dist_matrix.go @@ -0,0 +1,272 @@ +package obidist + +import ( + "fmt" +) + +// DistMatrix represents a symmetric matrix stored as a triangular matrix. +// The diagonal has a constant value (typically 0 for distances, 1 for similarities). +// Only the upper triangle (i < j) is stored to save memory. +// +// For an n×n matrix, we store n(n-1)/2 values. +type DistMatrix struct { + n int // Number of elements (matrix dimension) + data []float64 // Triangular storage: upper triangle only + labels []string // Optional labels for rows/columns + diagonalValue float64 // Value on the diagonal +} + +// NewDistMatrix creates a new distance matrix of size n×n. +// All distances are initialized to 0.0, diagonal is 0.0. +func NewDistMatrix(n int) *DistMatrix { + if n < 0 { + panic("matrix size must be non-negative") + } + + // Number of elements in upper triangle: n(n-1)/2 + size := n * (n - 1) / 2 + + return &DistMatrix{ + n: n, + data: make([]float64, size), + labels: make([]string, n), + diagonalValue: 0.0, + } +} + +// NewDistMatrixWithLabels creates a new distance matrix with labels. +// Diagonal is 0.0 by default. +func NewDistMatrixWithLabels(labels []string) *DistMatrix { + dm := NewDistMatrix(len(labels)) + copy(dm.labels, labels) + return dm +} + +// NewSimilarityMatrix creates a new similarity matrix of size n×n. +// All off-diagonal values are initialized to 0.0, diagonal is 1.0. +func NewSimilarityMatrix(n int) *DistMatrix { + if n < 0 { + panic("matrix size must be non-negative") + } + + // Number of elements in upper triangle: n(n-1)/2 + size := n * (n - 1) / 2 + + return &DistMatrix{ + n: n, + data: make([]float64, size), + labels: make([]string, n), + diagonalValue: 1.0, + } +} + +// NewSimilarityMatrixWithLabels creates a new similarity matrix with labels. +// Diagonal is 1.0. +func NewSimilarityMatrixWithLabels(labels []string) *DistMatrix { + dm := NewSimilarityMatrix(len(labels)) + copy(dm.labels, labels) + return dm +} + +// Size returns the dimension of the matrix (n for an n×n matrix). +func (dm *DistMatrix) Size() int { + return dm.n +} + +// indexFor computes the index in the data array for element (i, j). +// Assumes i < j (upper triangle). +// +// The upper triangle is stored row by row: +// (0,1), (0,2), ..., (0,n-1), (1,2), (1,3), ..., (1,n-1), (2,3), ... +// +// For element (i, j) where i < j: +// index = i*(n-1) + j - 1 - i*(i+1)/2 +// +// This can be simplified to: +// index = i*n - i*(i+1)/2 + j - i - 1 +// = i*(n - (i+1)/2 - 1) + j - 1 +// = i*(n - 1 - i/2 - 1/2) + j - 1 +// +// But the clearest formula is: +// index = i*n - i*(i+3)/2 + j - 1 +func (dm *DistMatrix) indexFor(i, j int) int { + if i >= j { + panic(fmt.Sprintf("indexFor expects i < j, got i=%d, j=%d", i, j)) + } + // Formula: number of elements in previous rows + position in current row + // Previous rows (0 to i-1): sum from k=0 to i-1 of (n-1-k) = i*n - i*(i+1)/2 + // Current row position: j - i - 1 + return i*dm.n - i*(i+1)/2 + j - i - 1 +} + +// Get returns the value at position (i, j). +// The matrix is symmetric, so Get(i, j) == Get(j, i). +// The diagonal returns the diagonalValue (0.0 for distances, 1.0 for similarities). +func (dm *DistMatrix) Get(i, j int) float64 { + if i < 0 || i >= dm.n || j < 0 || j >= dm.n { + panic(fmt.Sprintf("indices out of bounds: i=%d, j=%d, n=%d", i, j, dm.n)) + } + + // Diagonal: return the diagonal value + if i == j { + return dm.diagonalValue + } + + // Ensure i < j for indexing + if i > j { + i, j = j, i + } + + return dm.data[dm.indexFor(i, j)] +} + +// Set sets the value at position (i, j). +// The matrix is symmetric, so Set(i, j, v) also sets (j, i) to v. +// Setting the diagonal (i == j) is ignored (diagonal has a fixed value). +func (dm *DistMatrix) Set(i, j int, value float64) { + if i < 0 || i >= dm.n || j < 0 || j >= dm.n { + panic(fmt.Sprintf("indices out of bounds: i=%d, j=%d, n=%d", i, j, dm.n)) + } + + // Ignore diagonal assignments (diagonal has a fixed value) + if i == j { + return + } + + // Ensure i < j for indexing + if i > j { + i, j = j, i + } + + dm.data[dm.indexFor(i, j)] = value +} + +// GetLabel returns the label for element i. +func (dm *DistMatrix) GetLabel(i int) string { + if i < 0 || i >= dm.n { + panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n)) + } + return dm.labels[i] +} + +// SetLabel sets the label for element i. +func (dm *DistMatrix) SetLabel(i int, label string) { + if i < 0 || i >= dm.n { + panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n)) + } + dm.labels[i] = label +} + +// Labels returns a copy of all labels. +func (dm *DistMatrix) Labels() []string { + labels := make([]string, dm.n) + copy(labels, dm.labels) + return labels +} + +// GetRow returns the i-th row of the distance matrix. +// The returned slice is a copy. +func (dm *DistMatrix) GetRow(i int) []float64 { + if i < 0 || i >= dm.n { + panic(fmt.Sprintf("index out of bounds: i=%d, n=%d", i, dm.n)) + } + + row := make([]float64, dm.n) + for j := 0; j < dm.n; j++ { + row[j] = dm.Get(i, j) + } + return row +} + +// GetColumn returns the j-th column of the distance matrix. +// Since the matrix is symmetric, GetColumn(j) == GetRow(j). +// The returned slice is a copy. +func (dm *DistMatrix) GetColumn(j int) []float64 { + return dm.GetRow(j) +} + +// MinDistance returns the minimum non-zero distance in the matrix, +// along with the indices (i, j) where it occurs. +// Returns (0.0, -1, -1) if the matrix is empty or all distances are 0. +func (dm *DistMatrix) MinDistance() (float64, int, int) { + if dm.n <= 1 { + return 0.0, -1, -1 + } + + minDist := -1.0 + minI, minJ := -1, -1 + + for i := 0; i < dm.n-1; i++ { + for j := i + 1; j < dm.n; j++ { + dist := dm.Get(i, j) + if minDist < 0 || dist < minDist { + minDist = dist + minI = i + minJ = j + } + } + } + + if minI < 0 { + return 0.0, -1, -1 + } + + return minDist, minI, minJ +} + +// MaxDistance returns the maximum distance in the matrix, +// along with the indices (i, j) where it occurs. +// Returns (0.0, -1, -1) if the matrix is empty. +func (dm *DistMatrix) MaxDistance() (float64, int, int) { + if dm.n <= 1 { + return 0.0, -1, -1 + } + + maxDist := -1.0 + maxI, maxJ := -1, -1 + + for i := 0; i < dm.n-1; i++ { + for j := i + 1; j < dm.n; j++ { + dist := dm.Get(i, j) + if maxDist < 0 || dist > maxDist { + maxDist = dist + maxI = i + maxJ = j + } + } + } + + if maxI < 0 { + return 0.0, -1, -1 + } + + return maxDist, maxI, maxJ +} + +// Copy creates a deep copy of the matrix. +func (dm *DistMatrix) Copy() *DistMatrix { + newDM := &DistMatrix{ + n: dm.n, + data: make([]float64, len(dm.data)), + labels: make([]string, dm.n), + diagonalValue: dm.diagonalValue, + } + + copy(newDM.data, dm.data) + copy(newDM.labels, dm.labels) + + return newDM +} + +// ToFullMatrix returns a full n×n matrix representation. +// This allocates n² values, so use only when needed. +func (dm *DistMatrix) ToFullMatrix() [][]float64 { + matrix := make([][]float64, dm.n) + for i := 0; i < dm.n; i++ { + matrix[i] = make([]float64, dm.n) + for j := 0; j < dm.n; j++ { + matrix[i][j] = dm.Get(i, j) + } + } + return matrix +} diff --git a/pkg/obidist/dist_matrix_test.go b/pkg/obidist/dist_matrix_test.go new file mode 100644 index 0000000..bae9e17 --- /dev/null +++ b/pkg/obidist/dist_matrix_test.go @@ -0,0 +1,386 @@ +package obidist + +import ( + "math" + "testing" +) + +func TestNewDistMatrix(t *testing.T) { + dm := NewDistMatrix(5) + + if dm.Size() != 5 { + t.Errorf("Expected size 5, got %d", dm.Size()) + } + + // Check that all values are initialized to 0 + for i := 0; i < 5; i++ { + for j := 0; j < 5; j++ { + if dm.Get(i, j) != 0.0 { + t.Errorf("Expected 0.0 at (%d, %d), got %f", i, j, dm.Get(i, j)) + } + } + } +} + +func TestDistMatrixDiagonal(t *testing.T) { + dm := NewDistMatrix(5) + + // Diagonal should always be 0 + for i := 0; i < 5; i++ { + if dm.Get(i, i) != 0.0 { + t.Errorf("Expected diagonal element (%d, %d) to be 0.0, got %f", i, i, dm.Get(i, i)) + } + } + + // Try to set diagonal (should be ignored) + dm.Set(2, 2, 5.0) + if dm.Get(2, 2) != 0.0 { + t.Errorf("Diagonal should remain 0.0 even after Set, got %f", dm.Get(2, 2)) + } +} + +func TestDistMatrixSymmetry(t *testing.T) { + dm := NewDistMatrix(4) + + dm.Set(0, 1, 1.5) + dm.Set(0, 2, 2.5) + dm.Set(1, 3, 3.5) + + // Check symmetry + if dm.Get(0, 1) != dm.Get(1, 0) { + t.Errorf("Matrix not symmetric: Get(0,1)=%f, Get(1,0)=%f", dm.Get(0, 1), dm.Get(1, 0)) + } + + if dm.Get(0, 2) != dm.Get(2, 0) { + t.Errorf("Matrix not symmetric: Get(0,2)=%f, Get(2,0)=%f", dm.Get(0, 2), dm.Get(2, 0)) + } + + if dm.Get(1, 3) != dm.Get(3, 1) { + t.Errorf("Matrix not symmetric: Get(1,3)=%f, Get(3,1)=%f", dm.Get(1, 3), dm.Get(3, 1)) + } +} + +func TestDistMatrixSetGet(t *testing.T) { + dm := NewDistMatrix(4) + + testCases := []struct { + i int + j int + value float64 + }{ + {0, 1, 1.5}, + {0, 2, 2.5}, + {0, 3, 3.5}, + {1, 2, 4.5}, + {1, 3, 5.5}, + {2, 3, 6.5}, + } + + for _, tc := range testCases { + dm.Set(tc.i, tc.j, tc.value) + } + + for _, tc := range testCases { + got := dm.Get(tc.i, tc.j) + if math.Abs(got-tc.value) > 1e-10 { + t.Errorf("Get(%d, %d): expected %f, got %f", tc.i, tc.j, tc.value, got) + } + + // Check symmetry + got = dm.Get(tc.j, tc.i) + if math.Abs(got-tc.value) > 1e-10 { + t.Errorf("Get(%d, %d) (symmetric): expected %f, got %f", tc.j, tc.i, tc.value, got) + } + } +} + +func TestDistMatrixLabels(t *testing.T) { + labels := []string{"A", "B", "C", "D"} + dm := NewDistMatrixWithLabels(labels) + + if dm.Size() != 4 { + t.Errorf("Expected size 4, got %d", dm.Size()) + } + + for i, label := range labels { + if dm.GetLabel(i) != label { + t.Errorf("Expected label %s at index %d, got %s", label, i, dm.GetLabel(i)) + } + } + + // Modify a label + dm.SetLabel(1, "Modified") + if dm.GetLabel(1) != "Modified" { + t.Errorf("Expected label 'Modified' at index 1, got %s", dm.GetLabel(1)) + } + + // Check Labels() returns a copy + labelsCopy := dm.Labels() + labelsCopy[0] = "ChangedCopy" + if dm.GetLabel(0) != "A" { + t.Errorf("Modifying Labels() return value should not affect original labels") + } +} + +func TestDistMatrixMinDistance(t *testing.T) { + dm := NewDistMatrix(4) + + dm.Set(0, 1, 2.5) + dm.Set(0, 2, 1.5) // minimum + dm.Set(0, 3, 3.5) + dm.Set(1, 2, 4.5) + dm.Set(1, 3, 5.5) + dm.Set(2, 3, 6.5) + + minDist, minI, minJ := dm.MinDistance() + + if math.Abs(minDist-1.5) > 1e-10 { + t.Errorf("Expected min distance 1.5, got %f", minDist) + } + + if (minI != 0 || minJ != 2) && (minI != 2 || minJ != 0) { + t.Errorf("Expected min at (0, 2) or (2, 0), got (%d, %d)", minI, minJ) + } +} + +func TestDistMatrixMaxDistance(t *testing.T) { + dm := NewDistMatrix(4) + + dm.Set(0, 1, 2.5) + dm.Set(0, 2, 1.5) + dm.Set(0, 3, 3.5) + dm.Set(1, 2, 4.5) + dm.Set(1, 3, 5.5) + dm.Set(2, 3, 6.5) // maximum + + maxDist, maxI, maxJ := dm.MaxDistance() + + if math.Abs(maxDist-6.5) > 1e-10 { + t.Errorf("Expected max distance 6.5, got %f", maxDist) + } + + if (maxI != 2 || maxJ != 3) && (maxI != 3 || maxJ != 2) { + t.Errorf("Expected max at (2, 3) or (3, 2), got (%d, %d)", maxI, maxJ) + } +} + +func TestDistMatrixGetRow(t *testing.T) { + dm := NewDistMatrix(3) + + dm.Set(0, 1, 1.0) + dm.Set(0, 2, 2.0) + dm.Set(1, 2, 3.0) + + row0 := dm.GetRow(0) + expected0 := []float64{0.0, 1.0, 2.0} + + for i, val := range expected0 { + if math.Abs(row0[i]-val) > 1e-10 { + t.Errorf("Row 0[%d]: expected %f, got %f", i, val, row0[i]) + } + } + + row1 := dm.GetRow(1) + expected1 := []float64{1.0, 0.0, 3.0} + + for i, val := range expected1 { + if math.Abs(row1[i]-val) > 1e-10 { + t.Errorf("Row 1[%d]: expected %f, got %f", i, val, row1[i]) + } + } +} + +func TestDistMatrixCopy(t *testing.T) { + dm := NewDistMatrixWithLabels([]string{"A", "B", "C"}) + dm.Set(0, 1, 1.5) + dm.Set(0, 2, 2.5) + dm.Set(1, 2, 3.5) + + dmCopy := dm.Copy() + + // Check values are copied + if dmCopy.Get(0, 1) != dm.Get(0, 1) { + t.Errorf("Copy has different value at (0, 1)") + } + + // Check labels are copied + if dmCopy.GetLabel(0) != dm.GetLabel(0) { + t.Errorf("Copy has different label at index 0") + } + + // Modify copy and ensure original unchanged + dmCopy.Set(0, 1, 99.9) + if dm.Get(0, 1) == 99.9 { + t.Errorf("Modifying copy affected original matrix") + } + + dmCopy.SetLabel(0, "Modified") + if dm.GetLabel(0) == "Modified" { + t.Errorf("Modifying copy label affected original matrix") + } +} + +func TestDistMatrixToFullMatrix(t *testing.T) { + dm := NewDistMatrix(3) + dm.Set(0, 1, 1.0) + dm.Set(0, 2, 2.0) + dm.Set(1, 2, 3.0) + + full := dm.ToFullMatrix() + + expected := [][]float64{ + {0.0, 1.0, 2.0}, + {1.0, 0.0, 3.0}, + {2.0, 3.0, 0.0}, + } + + for i := 0; i < 3; i++ { + for j := 0; j < 3; j++ { + if math.Abs(full[i][j]-expected[i][j]) > 1e-10 { + t.Errorf("Full matrix[%d][%d]: expected %f, got %f", + i, j, expected[i][j], full[i][j]) + } + } + } +} + +func TestDistMatrixBoundsChecking(t *testing.T) { + dm := NewDistMatrix(3) + + // Test Get out of bounds + testPanic := func(f func()) { + defer func() { + if r := recover(); r == nil { + t.Errorf("Expected panic, but didn't get one") + } + }() + f() + } + + testPanic(func() { dm.Get(-1, 0) }) + testPanic(func() { dm.Get(0, 3) }) + testPanic(func() { dm.Set(3, 0, 1.0) }) + testPanic(func() { dm.GetLabel(-1) }) + testPanic(func() { dm.SetLabel(3, "Invalid") }) + testPanic(func() { dm.GetRow(3) }) +} + +func TestDistMatrixEmptyMatrix(t *testing.T) { + dm := NewDistMatrix(0) + + if dm.Size() != 0 { + t.Errorf("Expected size 0, got %d", dm.Size()) + } + + minDist, minI, minJ := dm.MinDistance() + if minDist != 0.0 || minI != -1 || minJ != -1 { + t.Errorf("Empty matrix MinDistance should return (0.0, -1, -1), got (%f, %d, %d)", + minDist, minI, minJ) + } + + maxDist, maxI, maxJ := dm.MaxDistance() + if maxDist != 0.0 || maxI != -1 || maxJ != -1 { + t.Errorf("Empty matrix MaxDistance should return (0.0, -1, -1), got (%f, %d, %d)", + maxDist, maxI, maxJ) + } +} + +func TestDistMatrixSingleElement(t *testing.T) { + dm := NewDistMatrix(1) + + if dm.Size() != 1 { + t.Errorf("Expected size 1, got %d", dm.Size()) + } + + // Only element is diagonal (always 0) + if dm.Get(0, 0) != 0.0 { + t.Errorf("Expected 0.0 at (0, 0), got %f", dm.Get(0, 0)) + } + + minDist, minI, minJ := dm.MinDistance() + if minDist != 0.0 || minI != -1 || minJ != -1 { + t.Errorf("Single element matrix MinDistance should return (0.0, -1, -1), got (%f, %d, %d)", + minDist, minI, minJ) + } +} + +func TestNewSimilarityMatrix(t *testing.T) { + sm := NewSimilarityMatrix(4) + + if sm.Size() != 4 { + t.Errorf("Expected size 4, got %d", sm.Size()) + } + + // Check diagonal is 1.0 + for i := 0; i < 4; i++ { + if sm.Get(i, i) != 1.0 { + t.Errorf("Expected diagonal element (%d, %d) to be 1.0, got %f", i, i, sm.Get(i, i)) + } + } + + // Check off-diagonal is 0.0 + if sm.Get(0, 1) != 0.0 { + t.Errorf("Expected off-diagonal to be 0.0, got %f", sm.Get(0, 1)) + } +} + +func TestNewSimilarityMatrixWithLabels(t *testing.T) { + labels := []string{"A", "B", "C"} + sm := NewSimilarityMatrixWithLabels(labels) + + if sm.Size() != 3 { + t.Errorf("Expected size 3, got %d", sm.Size()) + } + + // Check labels + for i, label := range labels { + if sm.GetLabel(i) != label { + t.Errorf("Expected label %s at index %d, got %s", label, i, sm.GetLabel(i)) + } + } + + // Check diagonal is 1.0 + for i := 0; i < 3; i++ { + if sm.Get(i, i) != 1.0 { + t.Errorf("Expected diagonal element (%d, %d) to be 1.0, got %f", i, i, sm.Get(i, i)) + } + } + + // Set some similarities + sm.Set(0, 1, 0.8) + sm.Set(0, 2, 0.6) + sm.Set(1, 2, 0.7) + + // Check values + if math.Abs(sm.Get(0, 1)-0.8) > 1e-10 { + t.Errorf("Expected 0.8 at (0, 1), got %f", sm.Get(0, 1)) + } + + if math.Abs(sm.Get(1, 0)-0.8) > 1e-10 { + t.Errorf("Expected 0.8 at (1, 0) (symmetry), got %f", sm.Get(1, 0)) + } +} + +func TestSimilarityMatrixCopy(t *testing.T) { + sm := NewSimilarityMatrix(3) + sm.Set(0, 1, 0.9) + sm.Set(0, 2, 0.7) + + smCopy := sm.Copy() + + // Check diagonal is preserved + if smCopy.Get(0, 0) != 1.0 { + t.Errorf("Copied similarity matrix should have diagonal 1.0, got %f", smCopy.Get(0, 0)) + } + + // Check values are preserved + if math.Abs(smCopy.Get(0, 1)-0.9) > 1e-10 { + t.Errorf("Copy should preserve values, expected 0.9, got %f", smCopy.Get(0, 1)) + } + + // Modify copy and ensure original unchanged + smCopy.Set(0, 1, 0.5) + if math.Abs(sm.Get(0, 1)-0.9) > 1e-10 { + t.Errorf("Modifying copy should not affect original, expected 0.9, got %f", sm.Get(0, 1)) + } +} diff --git a/pkg/obikmer/kmer_set.go b/pkg/obikmer/kmer_set.go index dd36054..f295072 100644 --- a/pkg/obikmer/kmer_set.go +++ b/pkg/obikmer/kmer_set.go @@ -158,6 +158,54 @@ func (ks *KmerSet) Difference(other *KmerSet) *KmerSet { return NewKmerSetFromBitmap(ks.k, result) } +// JaccardDistance computes the Jaccard distance between two KmerSets. +// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|) +// where A and B are the two sets. +// +// Returns: +// - 0.0 when sets are identical (distance = 0, similarity = 1) +// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0) +// - 1.0 when both sets are empty (by convention) +// +// Time complexity: O(|A| + |B|) for Roaring Bitmap operations +// Space complexity: O(1) as operations are done in-place on temporary bitmaps +func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 { + if ks.k != other.k { + panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k)) + } + + // Compute intersection cardinality + intersectionCard := ks.bitmap.AndCardinality(other.bitmap) + + // Compute union cardinality + unionCard := ks.bitmap.OrCardinality(other.bitmap) + + // If union is empty, both sets are empty - return 1.0 by convention + if unionCard == 0 { + return 1.0 + } + + // Jaccard similarity = |A ∩ B| / |A ∪ B| + similarity := float64(intersectionCard) / float64(unionCard) + + // Jaccard distance = 1 - similarity + return 1.0 - similarity +} + +// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets. +// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B| +// +// Returns: +// - 1.0 when sets are identical (maximum similarity) +// - 0.0 when sets are completely disjoint (no similarity) +// - 0.0 when both sets are empty (by convention) +// +// Time complexity: O(|A| + |B|) for Roaring Bitmap operations +// Space complexity: O(1) as operations are done in-place on temporary bitmaps +func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 { + return 1.0 - ks.JaccardDistance(other) +} + // Iterator returns an iterator over all k-mers in the set func (ks *KmerSet) Iterator() roaring64.IntIterable64 { return ks.bitmap.Iterator() diff --git a/pkg/obikmer/kmer_set_group.go b/pkg/obikmer/kmer_set_group.go index 193c272..c008665 100644 --- a/pkg/obikmer/kmer_set_group.go +++ b/pkg/obikmer/kmer_set_group.go @@ -3,6 +3,7 @@ package obikmer import ( "fmt" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" ) @@ -260,3 +261,79 @@ Set breakdown: return result } + +// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group. +// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance +// between set i and set j. +// +// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|) +// +// The matrix labels are set to the IDs of the individual KmerSets if available, +// otherwise they are set to "set_0", "set_1", etc. +// +// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets +// Space complexity: O(n²) for the distance matrix +func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix { + n := len(ksg.sets) + + // Create labels from set IDs + labels := make([]string, n) + for i, ks := range ksg.sets { + if ks.Id() != "" { + labels[i] = ks.Id() + } else { + labels[i] = fmt.Sprintf("set_%d", i) + } + } + + dm := obidist.NewDistMatrixWithLabels(labels) + + // Compute pairwise distances + for i := 0; i < n-1; i++ { + for j := i + 1; j < n; j++ { + distance := ksg.sets[i].JaccardDistance(ksg.sets[j]) + dm.Set(i, j, distance) + } + } + + return dm +} + +// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group. +// Returns a similarity matrix where element (i, j) represents the Jaccard similarity +// between set i and set j. +// +// The Jaccard similarity is: |A ∩ B| / |A ∪ B| +// +// The diagonal is 1.0 (similarity of a set to itself). +// +// The matrix labels are set to the IDs of the individual KmerSets if available, +// otherwise they are set to "set_0", "set_1", etc. +// +// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets +// Space complexity: O(n²) for the similarity matrix +func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix { + n := len(ksg.sets) + + // Create labels from set IDs + labels := make([]string, n) + for i, ks := range ksg.sets { + if ks.Id() != "" { + labels[i] = ks.Id() + } else { + labels[i] = fmt.Sprintf("set_%d", i) + } + } + + sm := obidist.NewSimilarityMatrixWithLabels(labels) + + // Compute pairwise similarities + for i := 0; i < n-1; i++ { + for j := i + 1; j < n; j++ { + similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j]) + sm.Set(i, j, similarity) + } + } + + return sm +} diff --git a/pkg/obikmer/kmer_set_group_jaccard_test.go b/pkg/obikmer/kmer_set_group_jaccard_test.go new file mode 100644 index 0000000..1e17d02 --- /dev/null +++ b/pkg/obikmer/kmer_set_group_jaccard_test.go @@ -0,0 +1,231 @@ +package obikmer + +import ( + "math" + "testing" +) + +func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) { + ksg := NewKmerSetGroup(5, 3) + + // Set 0: {1, 2, 3} + ksg.Get(0).AddKmerCode(1) + ksg.Get(0).AddKmerCode(2) + ksg.Get(0).AddKmerCode(3) + ksg.Get(0).SetId("set_A") + + // Set 1: {2, 3, 4} + ksg.Get(1).AddKmerCode(2) + ksg.Get(1).AddKmerCode(3) + ksg.Get(1).AddKmerCode(4) + ksg.Get(1).SetId("set_B") + + // Set 2: {5, 6, 7} + ksg.Get(2).AddKmerCode(5) + ksg.Get(2).AddKmerCode(6) + ksg.Get(2).AddKmerCode(7) + ksg.Get(2).SetId("set_C") + + dm := ksg.JaccardDistanceMatrix() + + // Check labels + if dm.GetLabel(0) != "set_A" { + t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0)) + } + if dm.GetLabel(1) != "set_B" { + t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1)) + } + if dm.GetLabel(2) != "set_C" { + t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2)) + } + + // Check distances + // Distance(0, 1): + // Intersection: {2, 3} -> 2 elements + // Union: {1, 2, 3, 4} -> 4 elements + // Similarity: 2/4 = 0.5 + // Distance: 1 - 0.5 = 0.5 + expectedDist01 := 0.5 + actualDist01 := dm.Get(0, 1) + if math.Abs(actualDist01-expectedDist01) > 1e-10 { + t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01) + } + + // Distance(0, 2): + // Intersection: {} -> 0 elements + // Union: {1, 2, 3, 5, 6, 7} -> 6 elements + // Similarity: 0/6 = 0 + // Distance: 1 - 0 = 1.0 + expectedDist02 := 1.0 + actualDist02 := dm.Get(0, 2) + if math.Abs(actualDist02-expectedDist02) > 1e-10 { + t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02) + } + + // Distance(1, 2): + // Intersection: {} -> 0 elements + // Union: {2, 3, 4, 5, 6, 7} -> 6 elements + // Similarity: 0/6 = 0 + // Distance: 1 - 0 = 1.0 + expectedDist12 := 1.0 + actualDist12 := dm.Get(1, 2) + if math.Abs(actualDist12-expectedDist12) > 1e-10 { + t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12) + } + + // Check symmetry + if dm.Get(0, 1) != dm.Get(1, 0) { + t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f", + dm.Get(0, 1), dm.Get(1, 0)) + } + + // Check diagonal + if dm.Get(0, 0) != 0.0 { + t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0)) + } + if dm.Get(1, 1) != 0.0 { + t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1)) + } + if dm.Get(2, 2) != 0.0 { + t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2)) + } +} + +func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) { + ksg := NewKmerSetGroup(5, 3) + + // Set 0: {1, 2, 3} + ksg.Get(0).AddKmerCode(1) + ksg.Get(0).AddKmerCode(2) + ksg.Get(0).AddKmerCode(3) + + // Set 1: {2, 3, 4} + ksg.Get(1).AddKmerCode(2) + ksg.Get(1).AddKmerCode(3) + ksg.Get(1).AddKmerCode(4) + + // Set 2: {1, 2, 3} (same as set 0) + ksg.Get(2).AddKmerCode(1) + ksg.Get(2).AddKmerCode(2) + ksg.Get(2).AddKmerCode(3) + + sm := ksg.JaccardSimilarityMatrix() + + // Check similarities + // Similarity(0, 1): 0.5 (as calculated above) + expectedSim01 := 0.5 + actualSim01 := sm.Get(0, 1) + if math.Abs(actualSim01-expectedSim01) > 1e-10 { + t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01) + } + + // Similarity(0, 2): 1.0 (identical sets) + expectedSim02 := 1.0 + actualSim02 := sm.Get(0, 2) + if math.Abs(actualSim02-expectedSim02) > 1e-10 { + t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02) + } + + // Similarity(1, 2): 0.5 + // Intersection: {2, 3} -> 2 + // Union: {1, 2, 3, 4} -> 4 + // Similarity: 2/4 = 0.5 + expectedSim12 := 0.5 + actualSim12 := sm.Get(1, 2) + if math.Abs(actualSim12-expectedSim12) > 1e-10 { + t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12) + } + + // Check diagonal (similarity to self = 1.0) + if sm.Get(0, 0) != 1.0 { + t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0)) + } + if sm.Get(1, 1) != 1.0 { + t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1)) + } + if sm.Get(2, 2) != 1.0 { + t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2)) + } +} + +func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) { + ksg := NewKmerSetGroup(5, 4) + + // Create different sets + ksg.Get(0).AddKmerCode(1) + ksg.Get(0).AddKmerCode(2) + + ksg.Get(1).AddKmerCode(2) + ksg.Get(1).AddKmerCode(3) + + ksg.Get(2).AddKmerCode(1) + ksg.Get(2).AddKmerCode(2) + ksg.Get(2).AddKmerCode(3) + + ksg.Get(3).AddKmerCode(10) + ksg.Get(3).AddKmerCode(20) + + dm := ksg.JaccardDistanceMatrix() + sm := ksg.JaccardSimilarityMatrix() + + // For all pairs (including diagonal), distance + similarity should equal 1.0 + for i := 0; i < 4; i++ { + for j := 0; j < 4; j++ { + distance := dm.Get(i, j) + similarity := sm.Get(i, j) + sum := distance + similarity + + if math.Abs(sum-1.0) > 1e-10 { + t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0", + i, j, distance, similarity, sum) + } + } + } +} + +func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) { + ksg := NewKmerSetGroup(5, 3) + + // Don't set IDs - should use default labels + ksg.Get(0).AddKmerCode(1) + ksg.Get(1).AddKmerCode(2) + ksg.Get(2).AddKmerCode(3) + + dm := ksg.JaccardDistanceMatrix() + + // Check default labels + if dm.GetLabel(0) != "set_0" { + t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0)) + } + if dm.GetLabel(1) != "set_1" { + t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1)) + } + if dm.GetLabel(2) != "set_2" { + t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2)) + } +} + +func TestKmerSetGroupJaccardMatrixSize(t *testing.T) { + ksg := NewKmerSetGroup(5, 5) + + for i := 0; i < 5; i++ { + ksg.Get(i).AddKmerCode(uint64(i)) + } + + dm := ksg.JaccardDistanceMatrix() + + if dm.Size() != 5 { + t.Errorf("Expected matrix size 5, got %d", dm.Size()) + } + + // All sets are disjoint, so all distances should be 1.0 + for i := 0; i < 5; i++ { + for j := i + 1; j < 5; j++ { + dist := dm.Get(i, j) + if math.Abs(dist-1.0) > 1e-10 { + t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f", + i, j, dist) + } + } + } +} diff --git a/pkg/obikmer/kmer_set_test.go b/pkg/obikmer/kmer_set_test.go new file mode 100644 index 0000000..77144c7 --- /dev/null +++ b/pkg/obikmer/kmer_set_test.go @@ -0,0 +1,272 @@ +package obikmer + +import ( + "math" + "testing" +) + +func TestJaccardDistanceIdentical(t *testing.T) { + ks1 := NewKmerSet(5) + ks1.AddKmerCode(100) + ks1.AddKmerCode(200) + ks1.AddKmerCode(300) + + ks2 := NewKmerSet(5) + ks2.AddKmerCode(100) + ks2.AddKmerCode(200) + ks2.AddKmerCode(300) + + distance := ks1.JaccardDistance(ks2) + similarity := ks1.JaccardSimilarity(ks2) + + if distance != 0.0 { + t.Errorf("Expected distance 0.0 for identical sets, got %f", distance) + } + + if similarity != 1.0 { + t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity) + } +} + +func TestJaccardDistanceDisjoint(t *testing.T) { + ks1 := NewKmerSet(5) + ks1.AddKmerCode(100) + ks1.AddKmerCode(200) + ks1.AddKmerCode(300) + + ks2 := NewKmerSet(5) + ks2.AddKmerCode(400) + ks2.AddKmerCode(500) + ks2.AddKmerCode(600) + + distance := ks1.JaccardDistance(ks2) + similarity := ks1.JaccardSimilarity(ks2) + + if distance != 1.0 { + t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance) + } + + if similarity != 0.0 { + t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity) + } +} + +func TestJaccardDistancePartialOverlap(t *testing.T) { + // Set 1: {1, 2, 3} + ks1 := NewKmerSet(5) + ks1.AddKmerCode(1) + ks1.AddKmerCode(2) + ks1.AddKmerCode(3) + + // Set 2: {2, 3, 4} + ks2 := NewKmerSet(5) + ks2.AddKmerCode(2) + ks2.AddKmerCode(3) + ks2.AddKmerCode(4) + + // Intersection: {2, 3} -> cardinality = 2 + // Union: {1, 2, 3, 4} -> cardinality = 4 + // Similarity = 2/4 = 0.5 + // Distance = 1 - 0.5 = 0.5 + + distance := ks1.JaccardDistance(ks2) + similarity := ks1.JaccardSimilarity(ks2) + + expectedDistance := 0.5 + expectedSimilarity := 0.5 + + if math.Abs(distance-expectedDistance) > 1e-10 { + t.Errorf("Expected distance %f, got %f", expectedDistance, distance) + } + + if math.Abs(similarity-expectedSimilarity) > 1e-10 { + t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity) + } +} + +func TestJaccardDistanceOneSubsetOfOther(t *testing.T) { + // Set 1: {1, 2} + ks1 := NewKmerSet(5) + ks1.AddKmerCode(1) + ks1.AddKmerCode(2) + + // Set 2: {1, 2, 3, 4} + ks2 := NewKmerSet(5) + ks2.AddKmerCode(1) + ks2.AddKmerCode(2) + ks2.AddKmerCode(3) + ks2.AddKmerCode(4) + + // Intersection: {1, 2} -> cardinality = 2 + // Union: {1, 2, 3, 4} -> cardinality = 4 + // Similarity = 2/4 = 0.5 + // Distance = 1 - 0.5 = 0.5 + + distance := ks1.JaccardDistance(ks2) + similarity := ks1.JaccardSimilarity(ks2) + + expectedDistance := 0.5 + expectedSimilarity := 0.5 + + if math.Abs(distance-expectedDistance) > 1e-10 { + t.Errorf("Expected distance %f, got %f", expectedDistance, distance) + } + + if math.Abs(similarity-expectedSimilarity) > 1e-10 { + t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity) + } +} + +func TestJaccardDistanceEmptySets(t *testing.T) { + ks1 := NewKmerSet(5) + ks2 := NewKmerSet(5) + + distance := ks1.JaccardDistance(ks2) + similarity := ks1.JaccardSimilarity(ks2) + + // By convention, distance = 1.0 for empty sets + if distance != 1.0 { + t.Errorf("Expected distance 1.0 for empty sets, got %f", distance) + } + + if similarity != 0.0 { + t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity) + } +} + +func TestJaccardDistanceOneEmpty(t *testing.T) { + ks1 := NewKmerSet(5) + ks1.AddKmerCode(1) + ks1.AddKmerCode(2) + ks1.AddKmerCode(3) + + ks2 := NewKmerSet(5) + + distance := ks1.JaccardDistance(ks2) + similarity := ks1.JaccardSimilarity(ks2) + + // Intersection: {} -> cardinality = 0 + // Union: {1, 2, 3} -> cardinality = 3 + // Similarity = 0/3 = 0.0 + // Distance = 1.0 + + if distance != 1.0 { + t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance) + } + + if similarity != 0.0 { + t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity) + } +} + +func TestJaccardDistanceDifferentK(t *testing.T) { + ks1 := NewKmerSet(5) + ks1.AddKmerCode(1) + + ks2 := NewKmerSet(7) + ks2.AddKmerCode(1) + + defer func() { + if r := recover(); r == nil { + t.Errorf("Expected panic when computing Jaccard distance with different k values") + } + }() + + _ = ks1.JaccardDistance(ks2) +} + +func TestJaccardDistanceSimilarityRelation(t *testing.T) { + // Test that distance + similarity = 1.0 for all cases + testCases := []struct { + name string + ks1 *KmerSet + ks2 *KmerSet + }{ + { + name: "partial overlap", + ks1: func() *KmerSet { + ks := NewKmerSet(5) + ks.AddKmerCode(1) + ks.AddKmerCode(2) + ks.AddKmerCode(3) + return ks + }(), + ks2: func() *KmerSet { + ks := NewKmerSet(5) + ks.AddKmerCode(2) + ks.AddKmerCode(3) + ks.AddKmerCode(4) + ks.AddKmerCode(5) + return ks + }(), + }, + { + name: "identical", + ks1: func() *KmerSet { + ks := NewKmerSet(5) + ks.AddKmerCode(10) + ks.AddKmerCode(20) + return ks + }(), + ks2: func() *KmerSet { + ks := NewKmerSet(5) + ks.AddKmerCode(10) + ks.AddKmerCode(20) + return ks + }(), + }, + { + name: "disjoint", + ks1: func() *KmerSet { + ks := NewKmerSet(5) + ks.AddKmerCode(1) + return ks + }(), + ks2: func() *KmerSet { + ks := NewKmerSet(5) + ks.AddKmerCode(100) + return ks + }(), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + distance := tc.ks1.JaccardDistance(tc.ks2) + similarity := tc.ks1.JaccardSimilarity(tc.ks2) + + sum := distance + similarity + + if math.Abs(sum-1.0) > 1e-10 { + t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f", + distance, similarity, sum) + } + }) + } +} + +func TestJaccardDistanceSymmetry(t *testing.T) { + ks1 := NewKmerSet(5) + ks1.AddKmerCode(1) + ks1.AddKmerCode(2) + ks1.AddKmerCode(3) + + ks2 := NewKmerSet(5) + ks2.AddKmerCode(2) + ks2.AddKmerCode(3) + ks2.AddKmerCode(4) + + distance1 := ks1.JaccardDistance(ks2) + distance2 := ks2.JaccardDistance(ks1) + + similarity1 := ks1.JaccardSimilarity(ks2) + similarity2 := ks2.JaccardSimilarity(ks1) + + if math.Abs(distance1-distance2) > 1e-10 { + t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2) + } + + if math.Abs(similarity1-similarity2) > 1e-10 { + t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2) + } +} diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 089e158..8922302 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -1,20 +1,14 @@ package obioptions -import ( - "fmt" -) +// Version is automatically updated by the Makefile from version.txt +// The patch number (third digit) is incremented on each push to the repository -// TODO: The version number is extracted from git. This induces that the version -// corresponds to the last commit, and not the one when the file will be -// commited - -var _Commit = "a43e625" -var _Version = "Release 4.4.0" +var _Version = "Release 4.4.3" // Version returns the version of the obitools package. // // No parameters. // Returns a string representing the version of the obitools package. func VersionString() string { - return fmt.Sprintf("%s (%s)", _Version, _Commit) + return _Version } diff --git a/version.txt b/version.txt new file mode 100644 index 0000000..9e3a933 --- /dev/null +++ b/version.txt @@ -0,0 +1 @@ +4.4.3