package obikmer

import (
	"fmt"
	"strings"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)

// KmerSetGroup is an ordered vector of KmerSet values sharing one k-mer size.
// It is used to manage multiple k-mer sets together (for example, one set per
// frequency level).
type KmerSetGroup struct {
	id       string                 // Unique identifier of the KmerSetGroup
	k        int                    // Size of k-mers (no setter is provided)
	sets     []*KmerSet             // Vector of KmerSet
	Metadata map[string]interface{} // Group metadata (not individual sets)
}

// NewKmerSetGroup creates a new group of n empty KmerSets of k-mer size k.
// It panics if n < 1.
func NewKmerSetGroup(k int, n int) *KmerSetGroup {
	if n < 1 {
		panic("KmerSetGroup size must be >= 1")
	}

	sets := make([]*KmerSet, n)
	for i := range sets {
		sets[i] = NewKmerSet(k)
	}

	return &KmerSetGroup{
		k:        k,
		sets:     sets,
		Metadata: make(map[string]interface{}),
	}
}

// K returns the size of the k-mers stored in the group.
func (ksg *KmerSetGroup) K() int {
	return ksg.k
}

// Size returns the number of KmerSet in the group.
func (ksg *KmerSetGroup) Size() int {
	return len(ksg.sets)
}

// Get returns the KmerSet at the given index, or nil if the index is out of
// bounds.
func (ksg *KmerSetGroup) Get(index int) *KmerSet {
	if index < 0 || index >= len(ksg.sets) {
		return nil
	}
	return ksg.sets[index]
}

// Set replaces the KmerSet at the given index.
// It panics if the index is out of bounds, if ks is nil, or if the k-mer size
// of ks does not match the k-mer size of the group.
func (ksg *KmerSetGroup) Set(index int, ks *KmerSet) {
	if index < 0 || index >= len(ksg.sets) {
		panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
	}
	// Fail with an explicit message instead of a nil-pointer dereference on ks.k.
	if ks == nil {
		panic("KmerSet must not be nil")
	}
	if ks.k != ksg.k {
		panic(fmt.Sprintf("KmerSet k mismatch: expected %d, got %d", ksg.k, ks.k))
	}
	ksg.sets[index] = ks
}

// Len returns the number of k-mers in a specific KmerSet.
// Without argument it reports on the last KmerSet of the group; with an
// argument it reports on the KmerSet at index[0] (extra arguments are
// ignored). It returns 0 when the selected index is out of bounds, which
// includes the no-argument call on an empty group.
func (ksg *KmerSetGroup) Len(index ...int) uint64 {
	// Default to the last KmerSet; this is -1 for an empty group and is then
	// rejected by the bounds check below instead of panicking.
	idx := len(ksg.sets) - 1
	if len(index) > 0 {
		idx = index[0]
	}

	if idx < 0 || idx >= len(ksg.sets) {
		return 0
	}
	return ksg.sets[idx].Len()
}

// MemoryUsage returns the sum of MemoryUsage over every KmerSet of the group.
func (ksg *KmerSetGroup) MemoryUsage() uint64 {
	total := uint64(0)
	for _, ks := range ksg.sets {
		total += ks.MemoryUsage()
	}
	return total
}

// Clear empties every KmerSet in the group. The number of sets is unchanged.
func (ksg *KmerSetGroup) Clear() {
	for _, ks := range ksg.sets {
		ks.Clear()
	}
}

// Copy creates a copy of the group (consistent with BioSequence.Copy).
// Each KmerSet is duplicated through its own Copy method. The group metadata
// map is duplicated entry by entry, so the metadata values themselves are
// shared between the original and the copy.
func (ksg *KmerSetGroup) Copy() *KmerSetGroup {
	copiedSets := make([]*KmerSet, len(ksg.sets))
	for i, ks := range ksg.sets {
		copiedSets[i] = ks.Copy()
	}

	groupMetadata := make(map[string]interface{}, len(ksg.Metadata))
	for k, v := range ksg.Metadata {
		groupMetadata[k] = v
	}

	return &KmerSetGroup{
		id:       ksg.id,
		k:        ksg.k,
		sets:     copiedSets,
		Metadata: groupMetadata,
	}
}

// Id returns the identifier of the KmerSetGroup (consistent with
// BioSequence.Id).
func (ksg *KmerSetGroup) Id() string {
	return ksg.id
}

// SetId sets the identifier of the KmerSetGroup (consistent with
// BioSequence.SetId).
func (ksg *KmerSetGroup) SetId(id string) {
	ksg.id = id
}

// AddSequence adds all k-mers from a sequence to the KmerSet at the given
// index by delegating to KmerSet.AddSequence.
// It panics if the index is out of bounds.
func (ksg *KmerSetGroup) AddSequence(seq *obiseq.BioSequence, index int) {
	if index < 0 || index >= len(ksg.sets) {
		panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
	}
	ksg.sets[index].AddSequence(seq)
}

// AddSequences adds all k-mers from multiple sequences to the KmerSet at the
// given index by delegating to KmerSet.AddSequences.
// It panics if the index is out of bounds.
func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index int) {
	if index < 0 || index >= len(ksg.sets) {
		panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
	}
	ksg.sets[index].AddSequences(sequences)
}

// Union returns a new KmerSet holding the union of all KmerSet in the group.
// An empty group yields an empty KmerSet; a single-set group yields a copy of
// that set. The sets of the group are not modified.
//
// Optimization: the computation starts from a clone of the largest set so
// that the remaining unions are applied to it in place.
func (ksg *KmerSetGroup) Union() *KmerSet {
	if len(ksg.sets) == 0 {
		return NewKmerSet(ksg.k)
	}
	if len(ksg.sets) == 1 {
		return ksg.sets[0].Copy()
	}

	// Find the index of the largest set (the one with the most k-mers).
	maxIdx := 0
	maxCard := ksg.sets[0].Len()
	for i := 1; i < len(ksg.sets); i++ {
		card := ksg.sets[i].Len()
		if card > maxCard {
			maxCard = card
			maxIdx = i
		}
	}

	// Clone the largest set and merge every other set into the clone.
	result := ksg.sets[maxIdx].bitmap.Clone()
	for i := 0; i < len(ksg.sets); i++ {
		if i != maxIdx {
			result.Or(ksg.sets[i].bitmap)
		}
	}

	return NewKmerSetFromBitmap(ksg.k, result)
}

// Intersect returns a new KmerSet holding the intersection of all KmerSet in
// the group. An empty group yields an empty KmerSet; a single-set group yields
// a copy of that set. The sets of the group are not modified.
//
// Optimization: the computation starts from a clone of the smallest set so
// that the remaining intersections are applied to it in place.
func (ksg *KmerSetGroup) Intersect() *KmerSet {
	if len(ksg.sets) == 0 {
		return NewKmerSet(ksg.k)
	}
	if len(ksg.sets) == 1 {
		return ksg.sets[0].Copy()
	}

	// Find the index of the smallest set (the one with the fewest k-mers).
	minIdx := 0
	minCard := ksg.sets[0].Len()
	for i := 1; i < len(ksg.sets); i++ {
		card := ksg.sets[i].Len()
		if card < minCard {
			minCard = card
			minIdx = i
		}
	}

	// Clone the smallest set and intersect every other set into the clone.
	result := ksg.sets[minIdx].bitmap.Clone()
	for i := 0; i < len(ksg.sets); i++ {
		if i != minIdx {
			result.And(ksg.sets[i].bitmap)
		}
	}

	return NewKmerSetFromBitmap(ksg.k, result)
}

// KmerSetGroupStats summarizes a KmerSetGroup: its k-mer size, its number of
// sets, the total memory used and the per-set statistics.
type KmerSetGroupStats struct {
	K          int
	Size       int            // Number of KmerSet
	TotalBytes uint64         // Total memory used
	Sets       []KmerSetStats // Stats of each KmerSet
}

// KmerSetStats holds the statistics of one KmerSet of a group.
type KmerSetStats struct {
	Index     int    // Index of the KmerSet in the group
	Len       uint64 // Number of k-mers
	SizeBytes uint64 // Size in bytes
}

// Stats returns statistics for each KmerSet in the group, together with the
// total memory usage accumulated over all sets.
func (ksg *KmerSetGroup) Stats() KmerSetGroupStats {
	stats := KmerSetGroupStats{
		K:    ksg.k,
		Size: len(ksg.sets),
		Sets: make([]KmerSetStats, len(ksg.sets)),
	}

	for i, ks := range ksg.sets {
		sizeBytes := ks.MemoryUsage()
		stats.Sets[i] = KmerSetStats{
			Index:     i,
			Len:       ks.Len(),
			SizeBytes: sizeBytes,
		}
		stats.TotalBytes += sizeBytes
	}

	return stats
}

// String renders the statistics as a human-readable, multi-line report: a
// header with the k-mer size, the number of sets and the total memory in MB
// (bytes divided by 1024 twice), followed by one line per set.
func (ksgs KmerSetGroupStats) String() string {
	var report strings.Builder

	// Explicit "\n" escapes keep the header layout independent of how the
	// source file itself is wrapped.
	fmt.Fprintf(&report,
		"KmerSetGroup Statistics (k=%d, size=%d):\nTotal memory: %.2f MB\nSet breakdown:\n",
		ksgs.K, ksgs.Size, float64(ksgs.TotalBytes)/1024/1024)

	for _, set := range ksgs.Sets {
		fmt.Fprintf(&report, " Set[%d]: %d k-mers (%.2f MB)\n",
			set.Index, set.Len, float64(set.SizeBytes)/1024/1024)
	}

	return report.String()
}