package obistats

import (
	"math"
	"sync"
	"time"

	"golang.org/x/exp/rand"
	"gonum.org/v1/gonum/stat/sampleuv"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
	log "github.com/sirupsen/logrus"
)

// squareDist returns the squared Euclidean distance between two vectors of
// identical length.
func squareDist(a, b []float64) float64 {
	sum := 0.0

	for i := 0; i < len(a); i++ {
		diff := a[i] - b[i]
		sum += diff * diff
	}

	return sum
}

// DefaultRG returns a new pseudo-random number generator seeded with the
// current time. It can be used as the rg argument of MakeKmeansClustering.
func DefaultRG() *rand.Rand {
	return rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
}

// KmeansClustering holds the state of a K-means clustering of a data matrix:
// the coordinates of the cluster centers, the index in the data matrix of the
// data point used as each center, the size of each cluster, the squared
// distance of every data point to its closest center, and the cluster
// assigned to every data point.
type KmeansClustering struct {
	data     *obiutils.Matrix[float64] // the data matrix, one data point per row
	rg       *rand.Rand                // the random number generator used for seeding
	centers  obiutils.Matrix[float64]  // the coordinates of the cluster centers
	icenters []int                     // the row index in data of each center
	sizes    []int                     // the number of data points assigned to each cluster
	distmin  []float64                 // the squared distance of each data point to its closest center
	classes  []int                     // the cluster assigned to each data point
}

// MakeKmeansClustering builds a KmeansClustering over the data matrix and
// seeds it with k initial centers following a K-means++-like strategy: the
// first center is drawn uniformly at random, and every subsequent center is
// drawn with a probability proportional to the squared distance of each data
// point to its closest already selected center. The rg generator must not be
// nil; DefaultRG() provides a time-seeded one.
func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
	distmin := make([]float64, len(*data))
	for i := 0; i < len(distmin); i++ {
		distmin[i] = math.MaxFloat64
	}

	clustering := &KmeansClustering{
		data:     data,
		icenters: make([]int, 0, k),
		sizes:    make([]int, 0, k),
		centers:  make(obiutils.Matrix[float64], 0, k),
		distmin:  distmin,
		classes:  make([]int, len(*data)),
		rg:       rg,
	}

	for i := 0; i < k; i++ {
		clustering.AddACenter()
	}

	return clustering
}

// K returns the number of clusters in the K-means clustering algorithm.
//
// No parameters.
// Returns an integer.
func (clustering *KmeansClustering) K() int {
	return len(clustering.icenters)
}

// N returns the size of the dataset in the KmeansClustering instance.
//
// It does not take any parameters.
// The return type is an integer.
func (clustering *KmeansClustering) N() int {
	return len(*clustering.data)
}

// Dimension returns the dimension of the KmeansClustering data.
//
// No parameters.
// Returns an integer representing the dimension of the data.
func (clustering *KmeansClustering) Dimension() int {
	return len((*clustering.data)[0])
}

// AddACenter adds one more center to the clustering. The first center is
// drawn uniformly at random among the data points; subsequent centers are
// drawn with a probability proportional to the squared distance of each data
// point to its closest existing center. The distmin vector is updated with
// the distances to the new center.
func (clustering *KmeansClustering) AddACenter() {
	C := 0

	if clustering.K() == 0 {
		// Use the clustering's own generator so that seeding stays
		// reproducible when a seeded generator is provided.
		C = clustering.rg.Intn(clustering.N())
	} else {
		w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
		C, _ = w.Take()
	}

	clustering.icenters = append(clustering.icenters, C)
	clustering.sizes = append(clustering.sizes, 0)
	center := (*clustering.data)[C]
	clustering.centers = append(clustering.centers, center)

	n := clustering.N()
	for i := 0; i < n; i++ {
		d := squareDist((*clustering.data)[i], center)
		if d < clustering.distmin[i] {
			clustering.distmin[i] = d
		}
	}
}

// ResetEmptyCenters resets the empty centers in the KmeansClustering struct.
//
// It iterates over the centers and checks whether their corresponding sizes are zero.
// If a center is empty, a new center is drawn by weighted sampling based on the distmin and rg fields.
// The new center then replaces the empty one in the icenters and centers arrays.
// Finally, the function returns the number of empty centers that were reset.
func (clustering *KmeansClustering) ResetEmptyCenters() int {
	nreset := 0

	for i := 0; i < clustering.K(); i++ {
		if clustering.sizes[i] == 0 {
			w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
			C, _ := w.Take()
			clustering.icenters[i] = C
			clustering.centers[i] = (*clustering.data)[C]
			nreset++
		}
	}

	return nreset
}
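// Seeding sketch (illustrative only, not part of the package API): starting
// from an empty clustering and adding centers one by one makes the
// K-means++-like weighting used above explicit. The toy data matrix and the
// number of centers are arbitrary choices made for this sketch.
//
//	data := obiutils.Matrix[float64]{{0.0}, {0.1}, {4.0}, {4.1}, {9.0}}
//	clustering := MakeKmeansClustering(&data, 0, DefaultRG())
//
//	clustering.AddACenter() // first center: drawn uniformly among the 5 points
//	clustering.AddACenter() // second center: drawn with probability proportional
//	                        // to the squared distance to the first center, so
//	                        // points far from it are strongly favoured
//
//	log.Infof("seeded centers: %v", clustering.CentersIndices())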
// AssignToClass assigns each data point to a class based on the distance to the nearest center.
//
// This function does not take any parameters.
// It does not return anything.
func (clustering *KmeansClustering) AssignToClass() {
	var wg sync.WaitGroup
	var lock sync.Mutex

	for i := 0; i < clustering.K(); i++ {
		clustering.sizes[i] = 0
	}

	for i := 0; i < clustering.N(); i++ {
		clustering.distmin[i] = math.MaxFloat64
	}

	goroutine := func(i int) {
		defer wg.Done()

		dmin := math.MaxFloat64
		cmin := -1

		for j, center := range clustering.centers {
			dist := squareDist((*clustering.data)[i], center)
			if dist < dmin {
				dmin = dist
				cmin = j
			}
		}

		lock.Lock()
		clustering.classes[i] = cmin
		clustering.sizes[cmin]++
		clustering.distmin[i] = dmin
		lock.Unlock()
	}

	wg.Add(clustering.N())
	for i := 0; i < clustering.N(); i++ {
		go goroutine(i)
	}

	// Wait for all assignments to complete before looking for empty clusters.
	wg.Wait()

	nreset := clustering.ResetEmptyCenters()
	if nreset > 0 {
		log.Warnf("Reset %d empty centers", nreset)
		clustering.AssignToClass()
	}
}

// ComputeCenters calculates the centers of the K-means clustering algorithm.
//
// Each center is first recomputed as the mean of the data points assigned to
// its cluster, then replaced by the data point of that cluster closest to the
// mean, so that every center remains an actual data point.
//
// It takes no parameters.
// It does not return any values.
func (clustering *KmeansClustering) ComputeCenters() {
	var wg sync.WaitGroup

	centers := clustering.centers
	data := clustering.data
	classes := clustering.classes
	k := clustering.K()

	// First pass: sum the coordinates of the data points of each cluster.
	goroutine1 := func(centerIdx int) {
		defer wg.Done()

		// Accumulate into a fresh buffer: the current center aliases a row of
		// the data matrix, so summing into it in place would corrupt the data.
		acc := make([]float64, clustering.Dimension())

		for j, row := range *data {
			if classes[j] == centerIdx {
				for l, val := range row {
					acc[l] += val
				}
			}
		}

		centers[centerIdx] = acc
	}

	for i := 0; i < k; i++ {
		wg.Add(1)
		go goroutine1(i)
	}
	wg.Wait()

	// Turn the sums into means.
	for i := range centers {
		for j := range centers[i] {
			centers[i][j] /= float64(clustering.sizes[i])
		}
	}

	// Second pass: replace each mean by the closest data point of its cluster.
	goroutine2 := func(centerIdx int) {
		defer wg.Done()

		dkmin := math.MaxFloat64
		dki := -1
		center := centers[centerIdx]

		for j, row := range *data {
			if classes[j] == centerIdx {
				dist := squareDist(row, center)
				if dist < dkmin {
					dkmin = dist
					dki = j
				}
			}
		}

		clustering.icenters[centerIdx] = dki
		clustering.centers[centerIdx] = (*data)[dki]
	}

	for i := 0; i < k; i++ {
		wg.Add(1)
		go goroutine2(i)
	}
	wg.Wait()
}

// Inertia returns the sum over all data points of the squared distance to
// their closest center, as cached in distmin during seeding and the last
// assignment step.
func (clustering *KmeansClustering) Inertia() float64 {
	inertia := 0.0

	for i := 0; i < clustering.N(); i++ {
		inertia += clustering.distmin[i]
	}

	return inertia
}

// Centers returns the matrix of cluster center coordinates.
func (clustering *KmeansClustering) Centers() obiutils.Matrix[float64] {
	return clustering.centers
}

// CentersIndices returns, for each cluster, the index in the data matrix of
// the data point used as its center.
func (clustering *KmeansClustering) CentersIndices() []int {
	return clustering.icenters
}

// Sizes returns the number of data points assigned to each cluster.
func (clustering *KmeansClustering) Sizes() []int {
	return clustering.sizes
}

// Classes returns the cluster assigned to each data point.
func (clustering *KmeansClustering) Classes() []int {
	return clustering.classes
}

// Run iterates the assignment and center update steps until the decrease of
// the inertia between two cycles falls below threshold, or maxCycle cycles
// have been executed. It returns true if the algorithm converged (the inertia
// decrease fell below threshold) and false otherwise.
func (clustering *KmeansClustering) Run(maxCycle int, threshold float64) bool {
	prev := math.MaxFloat64
	newI := clustering.Inertia()

	for i := 0; i < maxCycle && (prev-newI) > threshold; i++ {
		prev = newI
		clustering.AssignToClass()
		clustering.ComputeCenters()
		newI = clustering.Inertia()
	}

	return (prev - newI) <= threshold
}
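// Usage sketch (illustrative only, not part of the package API): seed a
// clustering, run it, then read back the assignments. The data matrix, the
// number of clusters and the stopping parameters below are arbitrary values
// chosen for this sketch.
//
//	data := obiutils.Matrix[float64]{
//		{0.0, 0.1}, {0.2, 0.0}, {0.1, 0.1}, // a first group of points
//		{5.0, 5.1}, {5.2, 4.9}, {4.8, 5.0}, // a second group of points
//	}
//
//	clustering := MakeKmeansClustering(&data, 2, DefaultRG())
//	converged := clustering.Run(100, 1e-6)
//
//	log.Infof("converged: %v", converged)
//	log.Infof("classes:   %v", clustering.Classes())        // cluster of each point
//	log.Infof("centers:   %v", clustering.CentersIndices()) // row index of each center
//	log.Infof("inertia:   %f", clustering.Inertia())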
// // Kmeans performs the K-means clustering algorithm on the given data.
// //
// // If centers and *centers are not nil, centers is considered as initialized
// // and the number of classes (k) is set to the number of rows in centers.
// // Otherwise, the number of classes is defined by the value of k.
// //
// // Parameters:
// // - data: A pointer to a Matrix[float64] that represents the input data.
// // - k: An integer that specifies the number of clusters to create.
// // - threshold: A float64 value that determines the convergence threshold.
// // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
// //
// // Returns:
// // - classes: A slice of integers that assigns each data point to a cluster.
// // - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
// // - inertia: A float64 value that represents the overall inertia of the clustering.
// // - converged: A boolean value indicating whether the algorithm converged.
// func Kmeans(data *obiutils.Matrix[float64],
// 	k int,
// 	threshold float64,
// 	centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
// 	if centers == nil || *centers == nil {
// 		*centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
// 		center_ids := SampleIntWithoutReplacement(k, len(*data))
// 		for i, id := range center_ids {
// 			(*centers)[i] = (*data)[id]
// 		}
// 	} else {
// 		k = len(*centers)
// 	}
//
// 	classes := AssignToClass(data, centers)
// 	centers = ComputeCenters(data, k, classes)
// 	inertia := ComputeInertia(data, classes, centers)
// 	delta := threshold * 100.0
//
// 	for i := 0; i < 100 && delta > threshold; i++ {
// 		classes = AssignToClass(data, centers)
// 		centers = ComputeCenters(data, k, classes)
// 		newi := ComputeInertia(data, classes, centers)
// 		delta = inertia - newi
// 		inertia = newi
// 		log.Debugf("Inertia: %f, delta: %f", inertia, delta)
// 	}
//
// 	return classes, centers, inertia, delta < threshold
// }

// // KmeansBestRepresentative finds the best representative among the data points of each cluster in parallel.
// //
// // It takes a matrix of data points and a matrix of centers as input.
// // The best representative is the data point that is closest to the center of the cluster.
// // Returns an array of integers containing the index of the best representative for each cluster.
// func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
// 	bestRepresentative := make([]int, len(*centers))
//
// 	var wg sync.WaitGroup
// 	wg.Add(len(*centers))
//
// 	for j, center := range *centers {
// 		go func(j int, center []float64) {
// 			defer wg.Done()
//
// 			bestDistToCenter := math.MaxFloat64
// 			best := -1
//
// 			for i, row := range *data {
// 				dist := 0.0
// 				for d, val := range row {
// 					diff := val - center[d]
// 					dist += diff * diff
// 				}
// 				if dist < bestDistToCenter {
// 					bestDistToCenter = dist
// 					best = i
// 				}
// 			}
//
// 			if best == -1 {
// 				log.Fatalf("No representative found for cluster %d", j)
// 			}
//
// 			bestRepresentative[j] = best
// 		}(j, center)
// 	}
//
// 	wg.Wait()
//
// 	return bestRepresentative
// }