mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add Jaccard distance and similarity computations for KmerSet and KmerSetGroup This commit introduces Jaccard distance and similarity methods for KmerSet and KmerSetGroup. For KmerSet: - Added JaccardDistance method to compute the Jaccard distance between two KmerSets - Added JaccardSimilarity method to compute the Jaccard similarity between two KmerSets For KmerSetGroup: - Added JaccardDistanceMatrix method to compute a pairwise Jaccard distance matrix - Added JaccardSimilarityMatrix method to compute a pairwise Jaccard similarity matrix Also includes: - New DistMatrix implementation in pkg/obidist for storing and computing distance/similarity matrices - Updated version handling with bump-version target in Makefile - Added tests for all new methods
218 lines
6.5 KiB
Go
218 lines
6.5 KiB
Go
package obikmer
|
||
|
||
import (
|
||
"fmt"
|
||
|
||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||
"github.com/RoaringBitmap/roaring/roaring64"
|
||
)
|
||
|
||
// KmerSet wraps a set of k-mers stored in a Roaring Bitmap
|
||
// Provides utility methods for manipulating k-mer sets
|
||
type KmerSet struct {
|
||
id string // Unique identifier of the KmerSet
|
||
k int // Size of k-mers (immutable)
|
||
bitmap *roaring64.Bitmap // Bitmap containing the k-mers
|
||
Metadata map[string]interface{} // User metadata (key=atomic value)
|
||
}
|
||
|
||
// NewKmerSet creates a new empty KmerSet
|
||
func NewKmerSet(k int) *KmerSet {
|
||
return &KmerSet{
|
||
k: k,
|
||
bitmap: roaring64.New(),
|
||
Metadata: make(map[string]interface{}),
|
||
}
|
||
}
|
||
|
||
// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap
|
||
func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet {
|
||
return &KmerSet{
|
||
k: k,
|
||
bitmap: bitmap,
|
||
Metadata: make(map[string]interface{}),
|
||
}
|
||
}
|
||
|
||
// K returns the size of k-mers (immutable)
|
||
func (ks *KmerSet) K() int {
|
||
return ks.k
|
||
}
|
||
|
||
// AddKmerCode adds an encoded k-mer to the set
|
||
func (ks *KmerSet) AddKmerCode(kmer uint64) {
|
||
ks.bitmap.Add(kmer)
|
||
}
|
||
|
||
// AddCanonicalKmerCode adds an encoded canonical k-mer to the set
|
||
func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
|
||
canonical := CanonicalKmer(kmer, ks.k)
|
||
ks.bitmap.Add(canonical)
|
||
}
|
||
|
||
// AddKmer adds a k-mer to the set by encoding the sequence
|
||
// The sequence must have exactly k nucleotides
|
||
// Zero-allocation: encodes directly without creating an intermediate slice
|
||
func (ks *KmerSet) AddKmer(seq []byte) {
|
||
kmer := EncodeKmer(seq, ks.k)
|
||
ks.bitmap.Add(kmer)
|
||
}
|
||
|
||
// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence
|
||
// The sequence must have exactly k nucleotides
|
||
// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
|
||
func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
|
||
canonical := EncodeCanonicalKmer(seq, ks.k)
|
||
ks.bitmap.Add(canonical)
|
||
}
|
||
|
||
// AddSequence adds all k-mers from a sequence to the set
|
||
// Uses an iterator to avoid allocating an intermediate vector
|
||
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
|
||
rawSeq := seq.Sequence()
|
||
for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
|
||
ks.bitmap.Add(canonical)
|
||
}
|
||
}
|
||
|
||
// AddSequences adds all k-mers from multiple sequences in batch
|
||
func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
|
||
for _, seq := range *sequences {
|
||
ks.AddSequence(seq)
|
||
}
|
||
}
|
||
|
||
// Contains checks if a k-mer is in the set
|
||
func (ks *KmerSet) Contains(kmer uint64) bool {
|
||
return ks.bitmap.Contains(kmer)
|
||
}
|
||
|
||
// Len returns the number of k-mers in the set
|
||
func (ks *KmerSet) Len() uint64 {
|
||
return ks.bitmap.GetCardinality()
|
||
}
|
||
|
||
// MemoryUsage returns memory usage in bytes
|
||
func (ks *KmerSet) MemoryUsage() uint64 {
|
||
return ks.bitmap.GetSizeInBytes()
|
||
}
|
||
|
||
// Clear empties the set
|
||
func (ks *KmerSet) Clear() {
|
||
ks.bitmap.Clear()
|
||
}
|
||
|
||
// Copy creates a copy of the set (consistent with BioSequence.Copy)
|
||
func (ks *KmerSet) Copy() *KmerSet {
|
||
// Copy metadata
|
||
metadata := make(map[string]interface{}, len(ks.Metadata))
|
||
for k, v := range ks.Metadata {
|
||
metadata[k] = v
|
||
}
|
||
|
||
return &KmerSet{
|
||
id: ks.id,
|
||
k: ks.k,
|
||
bitmap: ks.bitmap.Clone(),
|
||
Metadata: metadata,
|
||
}
|
||
}
|
||
|
||
// Id returns the identifier of the KmerSet (consistent with BioSequence.Id)
|
||
func (ks *KmerSet) Id() string {
|
||
return ks.id
|
||
}
|
||
|
||
// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId)
|
||
func (ks *KmerSet) SetId(id string) {
|
||
ks.id = id
|
||
}
|
||
|
||
// Union returns the union of this set with another
|
||
func (ks *KmerSet) Union(other *KmerSet) *KmerSet {
|
||
if ks.k != other.k {
|
||
panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k))
|
||
}
|
||
result := ks.bitmap.Clone()
|
||
result.Or(other.bitmap)
|
||
return NewKmerSetFromBitmap(ks.k, result)
|
||
}
|
||
|
||
// Intersect returns the intersection of this set with another
|
||
func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet {
|
||
if ks.k != other.k {
|
||
panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k))
|
||
}
|
||
result := ks.bitmap.Clone()
|
||
result.And(other.bitmap)
|
||
return NewKmerSetFromBitmap(ks.k, result)
|
||
}
|
||
|
||
// Difference returns the difference of this set with another (this - other)
|
||
func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
|
||
if ks.k != other.k {
|
||
panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k))
|
||
}
|
||
result := ks.bitmap.Clone()
|
||
result.AndNot(other.bitmap)
|
||
return NewKmerSetFromBitmap(ks.k, result)
|
||
}
|
||
|
||
// JaccardDistance computes the Jaccard distance between two KmerSets.
|
||
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|)
|
||
// where A and B are the two sets.
|
||
//
|
||
// Returns:
|
||
// - 0.0 when sets are identical (distance = 0, similarity = 1)
|
||
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
|
||
// - 1.0 when both sets are empty (by convention)
|
||
//
|
||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
||
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
|
||
if ks.k != other.k {
|
||
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
|
||
}
|
||
|
||
// Compute intersection cardinality
|
||
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
|
||
|
||
// Compute union cardinality
|
||
unionCard := ks.bitmap.OrCardinality(other.bitmap)
|
||
|
||
// If union is empty, both sets are empty - return 1.0 by convention
|
||
if unionCard == 0 {
|
||
return 1.0
|
||
}
|
||
|
||
// Jaccard similarity = |A ∩ B| / |A ∪ B|
|
||
similarity := float64(intersectionCard) / float64(unionCard)
|
||
|
||
// Jaccard distance = 1 - similarity
|
||
return 1.0 - similarity
|
||
}
|
||
|
||
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
|
||
// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B|
|
||
//
|
||
// Returns:
|
||
// - 1.0 when sets are identical (maximum similarity)
|
||
// - 0.0 when sets are completely disjoint (no similarity)
|
||
// - 0.0 when both sets are empty (by convention)
|
||
//
|
||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
||
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
|
||
return 1.0 - ks.JaccardDistance(other)
|
||
}
|
||
|
||
// Iterator returns an iterator over all k-mers in the set
|
||
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
|
||
return ks.bitmap.Iterator()
|
||
}
|
||
|
||
// Bitmap returns the underlying bitmap (for compatibility)
|
||
func (ks *KmerSet) Bitmap() *roaring64.Bitmap {
|
||
return ks.bitmap
|
||
}
|