Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup

This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup.

For KmerSet:
- Added JaccardDistance method to compute the Jaccard distance between two KmerSets
- Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets

For KmerSetGroup:
- Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix
- Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix

Also includes:
- New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices
- Updated version handling with bump-version target in Makefile
- Added tests for all new methods
This commit is contained in:
Eric Coissac
2026-02-05 17:38:47 +01:00
parent aa2e94dd6f
commit e3c41fc11b
9 changed files with 1328 additions and 28 deletions

View File

@@ -158,6 +158,54 @@ func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
return NewKmerSetFromBitmap(ks.k, result)
}
// JaccardDistance computes the Jaccard distance between two KmerSets.
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A B|)
// where A and B are the two sets.
//
// Returns:
// - 0.0 when sets are identical (distance = 0, similarity = 1)
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
// - 1.0 when both sets are empty (by convention)
//
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
if ks.k != other.k {
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
}
// Compute intersection cardinality
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
// Compute union cardinality
unionCard := ks.bitmap.OrCardinality(other.bitmap)
// If union is empty, both sets are empty - return 1.0 by convention
if unionCard == 0 {
return 1.0
}
// Jaccard similarity = |A ∩ B| / |A B|
similarity := float64(intersectionCard) / float64(unionCard)
// Jaccard distance = 1 - similarity
return 1.0 - similarity
}
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
// The Jaccard similarity is defined as: |A ∩ B| / |A B|
//
// Returns:
// - 1.0 when sets are identical (maximum similarity)
// - 0.0 when sets are completely disjoint (no similarity)
// - 0.0 when both sets are empty (by convention)
//
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
return 1.0 - ks.JaccardDistance(other)
}
// Iterator returns an iterator over all k-mers in the set
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
return ks.bitmap.Iterator()