Files
obitools4/pkg/obikmer/kmer_set.go

225 lines
6.7 KiB
Go
Raw Normal View History

package obikmer
import (
"fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/RoaringBitmap/roaring/roaring64"
)
// KmerSet wraps a set of k-mers stored in a Roaring Bitmap
// Provides utility methods for manipulating k-mer sets
type KmerSet struct {
id string // Unique identifier of the KmerSet
k int // Size of k-mers (immutable)
bitmap *roaring64.Bitmap // Bitmap containing the k-mers
Metadata map[string]interface{} // User metadata (key=atomic value)
}
// NewKmerSet creates a new empty KmerSet
func NewKmerSet(k int) *KmerSet {
return &KmerSet{
k: k,
bitmap: roaring64.New(),
Metadata: make(map[string]interface{}),
}
}
// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap
func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet {
return &KmerSet{
k: k,
bitmap: bitmap,
Metadata: make(map[string]interface{}),
}
}
// K returns the size of k-mers (immutable)
func (ks *KmerSet) K() int {
return ks.k
}
// AddKmerCode adds an encoded k-mer to the set
func (ks *KmerSet) AddKmerCode(kmer uint64) {
ks.bitmap.Add(kmer)
}
// AddCanonicalKmerCode adds an encoded canonical k-mer to the set
func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
canonical := CanonicalKmer(kmer, ks.k)
ks.bitmap.Add(canonical)
}
// AddKmer adds a k-mer to the set by encoding the sequence
// The sequence must have exactly k nucleotides
// Zero-allocation: encodes directly without creating an intermediate slice
func (ks *KmerSet) AddKmer(seq []byte) {
kmer := EncodeKmer(seq, ks.k)
ks.bitmap.Add(kmer)
}
// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence
// The sequence must have exactly k nucleotides
// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
canonical := EncodeCanonicalKmer(seq, ks.k)
ks.bitmap.Add(canonical)
}
// AddSequence adds all k-mers from a sequence to the set
// Uses an iterator to avoid allocating an intermediate vector
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
rawSeq := seq.Sequence()
for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
ks.bitmap.Add(canonical)
}
}
// AddSequences adds all k-mers from multiple sequences in batch
func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
for _, seq := range *sequences {
ks.AddSequence(seq)
}
}
// AddSequenceSlice adds all k-mers from a slice of sequences
func (ks *KmerSet) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) {
for _, seq := range *sequences {
ks.AddSequence(seq)
}
}
// Contains checks if a k-mer is in the set
func (ks *KmerSet) Contains(kmer uint64) bool {
return ks.bitmap.Contains(kmer)
}
// Len returns the number of k-mers in the set
func (ks *KmerSet) Len() uint64 {
return ks.bitmap.GetCardinality()
}
// MemoryUsage returns memory usage in bytes
func (ks *KmerSet) MemoryUsage() uint64 {
return ks.bitmap.GetSizeInBytes()
}
// Clear empties the set
func (ks *KmerSet) Clear() {
ks.bitmap.Clear()
}
// Copy creates a copy of the set (consistent with BioSequence.Copy)
func (ks *KmerSet) Copy() *KmerSet {
// Copy metadata
metadata := make(map[string]interface{}, len(ks.Metadata))
for k, v := range ks.Metadata {
metadata[k] = v
}
return &KmerSet{
id: ks.id,
k: ks.k,
bitmap: ks.bitmap.Clone(),
Metadata: metadata,
}
}
// Id returns the identifier of the KmerSet (consistent with BioSequence.Id)
func (ks *KmerSet) Id() string {
return ks.id
}
// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId)
func (ks *KmerSet) SetId(id string) {
ks.id = id
}
// Union returns the union of this set with another
func (ks *KmerSet) Union(other *KmerSet) *KmerSet {
if ks.k != other.k {
panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k))
}
result := ks.bitmap.Clone()
result.Or(other.bitmap)
return NewKmerSetFromBitmap(ks.k, result)
}
// Intersect returns the intersection of this set with another
func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet {
if ks.k != other.k {
panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k))
}
result := ks.bitmap.Clone()
result.And(other.bitmap)
return NewKmerSetFromBitmap(ks.k, result)
}
// Difference returns the difference of this set with another (this - other)
func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
if ks.k != other.k {
panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k))
}
result := ks.bitmap.Clone()
result.AndNot(other.bitmap)
return NewKmerSetFromBitmap(ks.k, result)
}
// JaccardDistance computes the Jaccard distance between two KmerSets.
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A B|)
// where A and B are the two sets.
//
// Returns:
// - 0.0 when sets are identical (distance = 0, similarity = 1)
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
// - 1.0 when both sets are empty (by convention)
//
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
if ks.k != other.k {
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
}
// Compute intersection cardinality
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
// Compute union cardinality
unionCard := ks.bitmap.OrCardinality(other.bitmap)
// If union is empty, both sets are empty - return 1.0 by convention
if unionCard == 0 {
return 1.0
}
// Jaccard similarity = |A ∩ B| / |A B|
similarity := float64(intersectionCard) / float64(unionCard)
// Jaccard distance = 1 - similarity
return 1.0 - similarity
}
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
// The Jaccard similarity is defined as: |A ∩ B| / |A B|
//
// Returns:
// - 1.0 when sets are identical (maximum similarity)
// - 0.0 when sets are completely disjoint (no similarity)
// - 0.0 when both sets are empty (by convention)
//
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
return 1.0 - ks.JaccardDistance(other)
}
// Iterator returns an iterator over all k-mers in the set
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
return ks.bitmap.Iterator()
}
// Bitmap returns the underlying bitmap (for compatibility)
func (ks *KmerSet) Bitmap() *roaring64.Bitmap {
return ks.bitmap
}