2026-02-05 14:41:41 +01:00
|
|
|
|
package obikmer
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
|
"fmt"
|
|
|
|
|
|
|
|
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
|
|
|
|
"github.com/RoaringBitmap/roaring/roaring64"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// KmerSet wraps a set of k-mers stored in a Roaring Bitmap
|
|
|
|
|
|
// Provides utility methods for manipulating k-mer sets
|
2026-02-05 14:41:41 +01:00
|
|
|
|
type KmerSet struct {
|
2026-02-05 16:35:38 +01:00
|
|
|
|
id string // Unique identifier of the KmerSet
|
|
|
|
|
|
k int // Size of k-mers (immutable)
|
|
|
|
|
|
bitmap *roaring64.Bitmap // Bitmap containing the k-mers
|
|
|
|
|
|
Metadata map[string]interface{} // User metadata (key=atomic value)
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// NewKmerSet creates a new empty KmerSet
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func NewKmerSet(k int) *KmerSet {
|
|
|
|
|
|
return &KmerSet{
|
2026-02-05 15:32:19 +01:00
|
|
|
|
k: k,
|
2026-02-05 15:02:27 +01:00
|
|
|
|
bitmap: roaring64.New(),
|
|
|
|
|
|
Metadata: make(map[string]interface{}),
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet {
|
|
|
|
|
|
return &KmerSet{
|
2026-02-05 15:32:19 +01:00
|
|
|
|
k: k,
|
2026-02-05 15:02:27 +01:00
|
|
|
|
bitmap: bitmap,
|
|
|
|
|
|
Metadata: make(map[string]interface{}),
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// K returns the size of k-mers (immutable)
|
2026-02-05 15:32:19 +01:00
|
|
|
|
func (ks *KmerSet) K() int {
|
|
|
|
|
|
return ks.k
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// AddKmerCode adds an encoded k-mer to the set
|
2026-02-05 15:51:44 +01:00
|
|
|
|
func (ks *KmerSet) AddKmerCode(kmer uint64) {
|
2026-02-05 14:41:41 +01:00
|
|
|
|
ks.bitmap.Add(kmer)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// AddCanonicalKmerCode adds an encoded canonical k-mer to the set
|
2026-02-05 16:14:24 +01:00
|
|
|
|
func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
|
|
|
|
|
|
canonical := CanonicalKmer(kmer, ks.k)
|
2026-02-05 15:51:44 +01:00
|
|
|
|
ks.bitmap.Add(canonical)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// AddKmer adds a k-mer to the set by encoding the sequence
|
|
|
|
|
|
// The sequence must have exactly k nucleotides
|
|
|
|
|
|
// Zero-allocation: encodes directly without creating an intermediate slice
|
2026-02-05 15:51:44 +01:00
|
|
|
|
func (ks *KmerSet) AddKmer(seq []byte) {
|
|
|
|
|
|
kmer := EncodeKmer(seq, ks.k)
|
|
|
|
|
|
ks.bitmap.Add(kmer)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence
|
|
|
|
|
|
// The sequence must have exactly k nucleotides
|
|
|
|
|
|
// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
|
2026-02-05 16:14:24 +01:00
|
|
|
|
func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
|
|
|
|
|
|
canonical := EncodeCanonicalKmer(seq, ks.k)
|
2026-02-05 15:51:44 +01:00
|
|
|
|
ks.bitmap.Add(canonical)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// AddSequence adds all k-mers from a sequence to the set
|
|
|
|
|
|
// Uses an iterator to avoid allocating an intermediate vector
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
|
|
|
|
|
|
rawSeq := seq.Sequence()
|
2026-02-05 16:14:24 +01:00
|
|
|
|
for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
|
2026-02-05 14:41:41 +01:00
|
|
|
|
ks.bitmap.Add(canonical)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// AddSequences adds all k-mers from multiple sequences in batch
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
|
|
|
|
|
|
for _, seq := range *sequences {
|
|
|
|
|
|
ks.AddSequence(seq)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-09 17:50:33 +01:00
|
|
|
|
// AddSequenceSlice adds all k-mers from a slice of sequences
|
|
|
|
|
|
func (ks *KmerSet) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) {
|
|
|
|
|
|
for _, seq := range *sequences {
|
|
|
|
|
|
ks.AddSequence(seq)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Contains checks if a k-mer is in the set
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) Contains(kmer uint64) bool {
|
|
|
|
|
|
return ks.bitmap.Contains(kmer)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Len returns the number of k-mers in the set
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) Len() uint64 {
|
|
|
|
|
|
return ks.bitmap.GetCardinality()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// MemoryUsage returns memory usage in bytes
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) MemoryUsage() uint64 {
|
|
|
|
|
|
return ks.bitmap.GetSizeInBytes()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Clear empties the set
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) Clear() {
|
|
|
|
|
|
ks.bitmap.Clear()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Copy creates a copy of the set (consistent with BioSequence.Copy)
|
2026-02-05 15:32:19 +01:00
|
|
|
|
func (ks *KmerSet) Copy() *KmerSet {
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Copy metadata
|
2026-02-05 15:02:27 +01:00
|
|
|
|
metadata := make(map[string]interface{}, len(ks.Metadata))
|
|
|
|
|
|
for k, v := range ks.Metadata {
|
|
|
|
|
|
metadata[k] = v
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 14:41:41 +01:00
|
|
|
|
return &KmerSet{
|
2026-02-05 15:32:19 +01:00
|
|
|
|
id: ks.id,
|
|
|
|
|
|
k: ks.k,
|
2026-02-05 15:02:27 +01:00
|
|
|
|
bitmap: ks.bitmap.Clone(),
|
|
|
|
|
|
Metadata: metadata,
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Id returns the identifier of the KmerSet (consistent with BioSequence.Id)
|
2026-02-05 15:32:19 +01:00
|
|
|
|
func (ks *KmerSet) Id() string {
|
|
|
|
|
|
return ks.id
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId)
|
2026-02-05 15:32:19 +01:00
|
|
|
|
func (ks *KmerSet) SetId(id string) {
|
|
|
|
|
|
ks.id = id
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Union returns the union of this set with another
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) Union(other *KmerSet) *KmerSet {
|
2026-02-05 15:32:19 +01:00
|
|
|
|
if ks.k != other.k {
|
|
|
|
|
|
panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k))
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
result := ks.bitmap.Clone()
|
|
|
|
|
|
result.Or(other.bitmap)
|
2026-02-05 15:32:19 +01:00
|
|
|
|
return NewKmerSetFromBitmap(ks.k, result)
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Intersect returns the intersection of this set with another
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet {
|
2026-02-05 15:32:19 +01:00
|
|
|
|
if ks.k != other.k {
|
|
|
|
|
|
panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k))
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
result := ks.bitmap.Clone()
|
|
|
|
|
|
result.And(other.bitmap)
|
2026-02-05 15:32:19 +01:00
|
|
|
|
return NewKmerSetFromBitmap(ks.k, result)
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Difference returns the difference of this set with another (this - other)
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
|
2026-02-05 15:32:19 +01:00
|
|
|
|
if ks.k != other.k {
|
|
|
|
|
|
panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k))
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
result := ks.bitmap.Clone()
|
|
|
|
|
|
result.AndNot(other.bitmap)
|
2026-02-05 15:32:19 +01:00
|
|
|
|
return NewKmerSetFromBitmap(ks.k, result)
|
2026-02-05 14:41:41 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 17:38:47 +01:00
|
|
|
|
// JaccardDistance computes the Jaccard distance between two KmerSets.
|
|
|
|
|
|
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|)
|
|
|
|
|
|
// where A and B are the two sets.
|
|
|
|
|
|
//
|
|
|
|
|
|
// Returns:
|
|
|
|
|
|
// - 0.0 when sets are identical (distance = 0, similarity = 1)
|
|
|
|
|
|
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
|
|
|
|
|
|
// - 1.0 when both sets are empty (by convention)
|
|
|
|
|
|
//
|
|
|
|
|
|
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
|
|
|
|
|
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
|
|
|
|
|
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
|
|
|
|
|
|
if ks.k != other.k {
|
|
|
|
|
|
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Compute intersection cardinality
|
|
|
|
|
|
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
|
|
|
|
|
|
|
|
|
|
|
|
// Compute union cardinality
|
|
|
|
|
|
unionCard := ks.bitmap.OrCardinality(other.bitmap)
|
|
|
|
|
|
|
|
|
|
|
|
// If union is empty, both sets are empty - return 1.0 by convention
|
|
|
|
|
|
if unionCard == 0 {
|
|
|
|
|
|
return 1.0
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Jaccard similarity = |A ∩ B| / |A ∪ B|
|
|
|
|
|
|
similarity := float64(intersectionCard) / float64(unionCard)
|
|
|
|
|
|
|
|
|
|
|
|
// Jaccard distance = 1 - similarity
|
|
|
|
|
|
return 1.0 - similarity
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
|
|
|
|
|
|
// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B|
|
|
|
|
|
|
//
|
|
|
|
|
|
// Returns:
|
|
|
|
|
|
// - 1.0 when sets are identical (maximum similarity)
|
|
|
|
|
|
// - 0.0 when sets are completely disjoint (no similarity)
|
|
|
|
|
|
// - 0.0 when both sets are empty (by convention)
|
|
|
|
|
|
//
|
|
|
|
|
|
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
|
|
|
|
|
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
|
|
|
|
|
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
|
|
|
|
|
|
return 1.0 - ks.JaccardDistance(other)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Iterator returns an iterator over all k-mers in the set
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
|
|
|
|
|
|
return ks.bitmap.Iterator()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
|
// Bitmap returns the underlying bitmap (for compatibility)
|
2026-02-05 14:41:41 +01:00
|
|
|
|
func (ks *KmerSet) Bitmap() *roaring64.Bitmap {
|
|
|
|
|
|
return ks.bitmap
|
|
|
|
|
|
}
|