mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
This commit translates all French comments in the kmer filtering and set management code to English, improving code readability and maintainability for international collaborators.
263 lines
6.6 KiB
Go
263 lines
6.6 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
)
|
|
|
|
// KmerSetGroup represents a vector of KmerSet.
// It is used to manage multiple k-mer sets together (for example, one set per
// frequency level). The constructor and Set only ever store sets whose k-mer
// size equals the group's k.
type KmerSetGroup struct {
	id       string                 // Unique identifier of the KmerSetGroup
	k        int                    // Size of k-mers (immutable)
	sets     []*KmerSet             // Vector of KmerSet
	Metadata map[string]interface{} // Group metadata (not individual sets)
}
|
|
|
|
// NewKmerSetGroup creates a new group of n KmerSets
|
|
func NewKmerSetGroup(k int, n int) *KmerSetGroup {
|
|
if n < 1 {
|
|
panic("KmerSetGroup size must be >= 1")
|
|
}
|
|
|
|
sets := make([]*KmerSet, n)
|
|
for i := range sets {
|
|
sets[i] = NewKmerSet(k)
|
|
}
|
|
|
|
return &KmerSetGroup{
|
|
k: k,
|
|
sets: sets,
|
|
Metadata: make(map[string]interface{}),
|
|
}
|
|
}
|
|
|
|
// K returns the size of the k-mers handled by the group, i.e. the value of the
// k field (documented as immutable on the struct).
func (ksg *KmerSetGroup) K() int {
	return ksg.k
}
|
|
|
|
// Size returns the number of KmerSet held by the group. This counts sets, not
// k-mers; Len reports the number of k-mers of an individual set.
func (ksg *KmerSetGroup) Size() int {
	return len(ksg.sets)
}
|
|
|
|
// Get returns the KmerSet at the given index
|
|
// Returns nil if the index is invalid
|
|
func (ksg *KmerSetGroup) Get(index int) *KmerSet {
|
|
if index < 0 || index >= len(ksg.sets) {
|
|
return nil
|
|
}
|
|
return ksg.sets[index]
|
|
}
|
|
|
|
// Set replaces the KmerSet at the given index
|
|
// Panics if the index is invalid or if k does not match
|
|
func (ksg *KmerSetGroup) Set(index int, ks *KmerSet) {
|
|
if index < 0 || index >= len(ksg.sets) {
|
|
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
|
}
|
|
if ks.k != ksg.k {
|
|
panic(fmt.Sprintf("KmerSet k mismatch: expected %d, got %d", ksg.k, ks.k))
|
|
}
|
|
ksg.sets[index] = ks
|
|
}
|
|
|
|
// Len returns the number of k-mers in a specific KmerSet
|
|
// Without argument: returns the number of k-mers in the last KmerSet
|
|
// With argument index: returns the number of k-mers in the KmerSet at this index
|
|
func (ksg *KmerSetGroup) Len(index ...int) uint64 {
|
|
if len(index) == 0 {
|
|
// Without argument: last KmerSet
|
|
return ksg.sets[len(ksg.sets)-1].Len()
|
|
}
|
|
|
|
// With argument: specific KmerSet
|
|
idx := index[0]
|
|
if idx < 0 || idx >= len(ksg.sets) {
|
|
return 0
|
|
}
|
|
return ksg.sets[idx].Len()
|
|
}
|
|
|
|
// MemoryUsage returns the total memory usage in bytes
|
|
func (ksg *KmerSetGroup) MemoryUsage() uint64 {
|
|
total := uint64(0)
|
|
for _, ks := range ksg.sets {
|
|
total += ks.MemoryUsage()
|
|
}
|
|
return total
|
|
}
|
|
|
|
// Clear empties all KmerSet in the group
|
|
func (ksg *KmerSetGroup) Clear() {
|
|
for _, ks := range ksg.sets {
|
|
ks.Clear()
|
|
}
|
|
}
|
|
|
|
// Copy creates a complete copy of the group (consistent with BioSequence.Copy)
|
|
func (ksg *KmerSetGroup) Copy() *KmerSetGroup {
|
|
copiedSets := make([]*KmerSet, len(ksg.sets))
|
|
for i, ks := range ksg.sets {
|
|
copiedSets[i] = ks.Copy() // Copy each KmerSet with its metadata
|
|
}
|
|
|
|
// Copy group metadata
|
|
groupMetadata := make(map[string]interface{}, len(ksg.Metadata))
|
|
for k, v := range ksg.Metadata {
|
|
groupMetadata[k] = v
|
|
}
|
|
|
|
return &KmerSetGroup{
|
|
id: ksg.id,
|
|
k: ksg.k,
|
|
sets: copiedSets,
|
|
Metadata: groupMetadata,
|
|
}
|
|
}
|
|
|
|
// Id returns the identifier of the KmerSetGroup (consistent with BioSequence.Id).
// The identifier is the empty string until SetId has been called on a group
// built with NewKmerSetGroup.
func (ksg *KmerSetGroup) Id() string {
	return ksg.id
}
|
|
|
|
// SetId sets the identifier of the KmerSetGroup (consistent with BioSequence.SetId).
// The value is stored as given; no validation is performed.
func (ksg *KmerSetGroup) SetId(id string) {
	ksg.id = id
}
|
|
|
|
// AddSequence adds all k-mers from a sequence to a specific KmerSet
|
|
func (ksg *KmerSetGroup) AddSequence(seq *obiseq.BioSequence, index int) {
|
|
if index < 0 || index >= len(ksg.sets) {
|
|
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
|
}
|
|
ksg.sets[index].AddSequence(seq)
|
|
}
|
|
|
|
// AddSequences adds all k-mers from multiple sequences to a specific KmerSet
|
|
func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index int) {
|
|
if index < 0 || index >= len(ksg.sets) {
|
|
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
|
}
|
|
ksg.sets[index].AddSequences(sequences)
|
|
}
|
|
|
|
// Union returns the union of all KmerSet in the group
|
|
// Optimization: starts from the largest set to minimize operations
|
|
func (ksg *KmerSetGroup) Union() *KmerSet {
|
|
if len(ksg.sets) == 0 {
|
|
return NewKmerSet(ksg.k)
|
|
}
|
|
|
|
if len(ksg.sets) == 1 {
|
|
return ksg.sets[0].Copy()
|
|
}
|
|
|
|
// Find the index of the largest set (the one with the most k-mers)
|
|
maxIdx := 0
|
|
maxCard := ksg.sets[0].Len()
|
|
for i := 1; i < len(ksg.sets); i++ {
|
|
card := ksg.sets[i].Len()
|
|
if card > maxCard {
|
|
maxCard = card
|
|
maxIdx = i
|
|
}
|
|
}
|
|
|
|
// Copy the largest set and perform unions in-place
|
|
result := ksg.sets[maxIdx].bitmap.Clone()
|
|
for i := 0; i < len(ksg.sets); i++ {
|
|
if i != maxIdx {
|
|
result.Or(ksg.sets[i].bitmap)
|
|
}
|
|
}
|
|
|
|
return NewKmerSetFromBitmap(ksg.k, result)
|
|
}
|
|
|
|
// Intersect returns the intersection of all KmerSet in the group
|
|
// Optimization: starts from the smallest set to minimize operations
|
|
func (ksg *KmerSetGroup) Intersect() *KmerSet {
|
|
if len(ksg.sets) == 0 {
|
|
return NewKmerSet(ksg.k)
|
|
}
|
|
|
|
if len(ksg.sets) == 1 {
|
|
return ksg.sets[0].Copy()
|
|
}
|
|
|
|
// Find the index of the smallest set (the one with the fewest k-mers)
|
|
minIdx := 0
|
|
minCard := ksg.sets[0].Len()
|
|
for i := 1; i < len(ksg.sets); i++ {
|
|
card := ksg.sets[i].Len()
|
|
if card < minCard {
|
|
minCard = card
|
|
minIdx = i
|
|
}
|
|
}
|
|
|
|
// Copy the smallest set and perform intersections in-place
|
|
result := ksg.sets[minIdx].bitmap.Clone()
|
|
for i := 0; i < len(ksg.sets); i++ {
|
|
if i != minIdx {
|
|
result.And(ksg.sets[i].bitmap)
|
|
}
|
|
}
|
|
|
|
return NewKmerSetFromBitmap(ksg.k, result)
|
|
}
|
|
|
|
// KmerSetGroupStats holds the statistics of a KmerSetGroup as produced by its
// Stats method: the k-mer size, the number of sets, the cumulated memory usage
// and one KmerSetStats entry per set.
type KmerSetGroupStats struct {
	K          int
	Size       int            // Number of KmerSet
	TotalBytes uint64         // Total memory used
	Sets       []KmerSetStats // Stats of each KmerSet
}
|
|
|
|
// KmerSetStats holds the statistics of a single KmerSet belonging to a
// KmerSetGroup.
type KmerSetStats struct {
	Index     int    // Index of the KmerSet in the group
	Len       uint64 // Number of k-mers
	SizeBytes uint64 // Size in bytes
}
|
|
|
|
func (ksg *KmerSetGroup) Stats() KmerSetGroupStats {
|
|
stats := KmerSetGroupStats{
|
|
K: ksg.k,
|
|
Size: len(ksg.sets),
|
|
Sets: make([]KmerSetStats, len(ksg.sets)),
|
|
}
|
|
|
|
for i, ks := range ksg.sets {
|
|
sizeBytes := ks.MemoryUsage()
|
|
stats.Sets[i] = KmerSetStats{
|
|
Index: i,
|
|
Len: ks.Len(),
|
|
SizeBytes: sizeBytes,
|
|
}
|
|
stats.TotalBytes += sizeBytes
|
|
}
|
|
|
|
return stats
|
|
}
|
|
|
|
func (ksgs KmerSetGroupStats) String() string {
|
|
result := fmt.Sprintf(`KmerSetGroup Statistics (k=%d, size=%d):
|
|
Total memory: %.2f MB
|
|
|
|
Set breakdown:
|
|
`, ksgs.K, ksgs.Size, float64(ksgs.TotalBytes)/1024/1024)
|
|
|
|
for _, set := range ksgs.Sets {
|
|
result += fmt.Sprintf(" Set[%d]: %d k-mers (%.2f MB)\n",
|
|
set.Index,
|
|
set.Len,
|
|
float64(set.SizeBytes)/1024/1024)
|
|
}
|
|
|
|
return result
|
|
}
|