2026-02-05 14:41:41 +01:00
|
|
|
package obikmer
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"fmt"
|
|
|
|
|
|
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
|
|
|
"github.com/RoaringBitmap/roaring/roaring64"
|
|
|
|
|
)
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// KmerSet wraps a set of k-mers stored in a Roaring Bitmap
|
|
|
|
|
// Provides utility methods for manipulating k-mer sets
|
2026-02-05 14:41:41 +01:00
|
|
|
type KmerSet struct {
|
2026-02-05 16:35:38 +01:00
|
|
|
id string // Unique identifier of the KmerSet
|
|
|
|
|
k int // Size of k-mers (immutable)
|
|
|
|
|
bitmap *roaring64.Bitmap // Bitmap containing the k-mers
|
|
|
|
|
Metadata map[string]interface{} // User metadata (key=atomic value)
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// NewKmerSet creates a new empty KmerSet
|
2026-02-05 14:41:41 +01:00
|
|
|
func NewKmerSet(k int) *KmerSet {
|
|
|
|
|
return &KmerSet{
|
2026-02-05 15:32:19 +01:00
|
|
|
k: k,
|
2026-02-05 15:02:27 +01:00
|
|
|
bitmap: roaring64.New(),
|
|
|
|
|
Metadata: make(map[string]interface{}),
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap
|
2026-02-05 14:41:41 +01:00
|
|
|
func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet {
|
|
|
|
|
return &KmerSet{
|
2026-02-05 15:32:19 +01:00
|
|
|
k: k,
|
2026-02-05 15:02:27 +01:00
|
|
|
bitmap: bitmap,
|
|
|
|
|
Metadata: make(map[string]interface{}),
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// K returns the size of k-mers (immutable)
|
2026-02-05 15:32:19 +01:00
|
|
|
func (ks *KmerSet) K() int {
|
|
|
|
|
return ks.k
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// AddKmerCode adds an encoded k-mer to the set
|
2026-02-05 15:51:44 +01:00
|
|
|
func (ks *KmerSet) AddKmerCode(kmer uint64) {
|
2026-02-05 14:41:41 +01:00
|
|
|
ks.bitmap.Add(kmer)
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// AddCanonicalKmerCode adds an encoded canonical k-mer to the set
|
2026-02-05 16:14:24 +01:00
|
|
|
func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
|
|
|
|
|
canonical := CanonicalKmer(kmer, ks.k)
|
2026-02-05 15:51:44 +01:00
|
|
|
ks.bitmap.Add(canonical)
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// AddKmer adds a k-mer to the set by encoding the sequence
|
|
|
|
|
// The sequence must have exactly k nucleotides
|
|
|
|
|
// Zero-allocation: encodes directly without creating an intermediate slice
|
2026-02-05 15:51:44 +01:00
|
|
|
func (ks *KmerSet) AddKmer(seq []byte) {
|
|
|
|
|
kmer := EncodeKmer(seq, ks.k)
|
|
|
|
|
ks.bitmap.Add(kmer)
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence
|
|
|
|
|
// The sequence must have exactly k nucleotides
|
|
|
|
|
// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
|
2026-02-05 16:14:24 +01:00
|
|
|
func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
|
|
|
|
|
canonical := EncodeCanonicalKmer(seq, ks.k)
|
2026-02-05 15:51:44 +01:00
|
|
|
ks.bitmap.Add(canonical)
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// AddSequence adds all k-mers from a sequence to the set
|
|
|
|
|
// Uses an iterator to avoid allocating an intermediate vector
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
|
|
|
|
|
rawSeq := seq.Sequence()
|
2026-02-05 16:14:24 +01:00
|
|
|
for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
|
2026-02-05 14:41:41 +01:00
|
|
|
ks.bitmap.Add(canonical)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// AddSequences adds all k-mers from multiple sequences in batch
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
|
|
|
|
|
for _, seq := range *sequences {
|
|
|
|
|
ks.AddSequence(seq)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Contains checks if a k-mer is in the set
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) Contains(kmer uint64) bool {
|
|
|
|
|
return ks.bitmap.Contains(kmer)
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Len returns the number of k-mers in the set
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) Len() uint64 {
|
|
|
|
|
return ks.bitmap.GetCardinality()
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// MemoryUsage returns memory usage in bytes
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) MemoryUsage() uint64 {
|
|
|
|
|
return ks.bitmap.GetSizeInBytes()
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Clear empties the set
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) Clear() {
|
|
|
|
|
ks.bitmap.Clear()
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Copy creates a copy of the set (consistent with BioSequence.Copy)
|
2026-02-05 15:32:19 +01:00
|
|
|
func (ks *KmerSet) Copy() *KmerSet {
|
2026-02-05 16:35:38 +01:00
|
|
|
// Copy metadata
|
2026-02-05 15:02:27 +01:00
|
|
|
metadata := make(map[string]interface{}, len(ks.Metadata))
|
|
|
|
|
for k, v := range ks.Metadata {
|
|
|
|
|
metadata[k] = v
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 14:41:41 +01:00
|
|
|
return &KmerSet{
|
2026-02-05 15:32:19 +01:00
|
|
|
id: ks.id,
|
|
|
|
|
k: ks.k,
|
2026-02-05 15:02:27 +01:00
|
|
|
bitmap: ks.bitmap.Clone(),
|
|
|
|
|
Metadata: metadata,
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Id returns the identifier of the KmerSet (consistent with BioSequence.Id)
|
2026-02-05 15:32:19 +01:00
|
|
|
func (ks *KmerSet) Id() string {
|
|
|
|
|
return ks.id
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId)
|
2026-02-05 15:32:19 +01:00
|
|
|
func (ks *KmerSet) SetId(id string) {
|
|
|
|
|
ks.id = id
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Union returns the union of this set with another
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) Union(other *KmerSet) *KmerSet {
|
2026-02-05 15:32:19 +01:00
|
|
|
if ks.k != other.k {
|
|
|
|
|
panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k))
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
result := ks.bitmap.Clone()
|
|
|
|
|
result.Or(other.bitmap)
|
2026-02-05 15:32:19 +01:00
|
|
|
return NewKmerSetFromBitmap(ks.k, result)
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Intersect returns the intersection of this set with another
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet {
|
2026-02-05 15:32:19 +01:00
|
|
|
if ks.k != other.k {
|
|
|
|
|
panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k))
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
result := ks.bitmap.Clone()
|
|
|
|
|
result.And(other.bitmap)
|
2026-02-05 15:32:19 +01:00
|
|
|
return NewKmerSetFromBitmap(ks.k, result)
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Difference returns the difference of this set with another (this - other)
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
|
2026-02-05 15:32:19 +01:00
|
|
|
if ks.k != other.k {
|
|
|
|
|
panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k))
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
result := ks.bitmap.Clone()
|
|
|
|
|
result.AndNot(other.bitmap)
|
2026-02-05 15:32:19 +01:00
|
|
|
return NewKmerSetFromBitmap(ks.k, result)
|
2026-02-05 14:41:41 +01:00
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Iterator returns an iterator over all k-mers in the set
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
|
|
|
|
|
return ks.bitmap.Iterator()
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-05 16:35:38 +01:00
|
|
|
// Bitmap returns the underlying bitmap (for compatibility)
|
2026-02-05 14:41:41 +01:00
|
|
|
func (ks *KmerSet) Bitmap() *roaring64.Bitmap {
|
|
|
|
|
return ks.bitmap
|
|
|
|
|
}
|