package obistats import ( "math" "sync" "time" "golang.org/x/exp/rand" "gonum.org/v1/gonum/stat/sampleuv" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" log "github.com/sirupsen/logrus" ) // SquareDist calculates the squared Euclidean distance between // two vectors 'a' and 'b'. // // 'a' and 'b' are slices of float64 or int values representing // coordinate points in space. It is assumed that both slices // have the same length. // Returns the calculated squared distance as a float64. func SquareDist[T float64 | int](a, b []T) T { sum := T(0) for i, v := range a { diff := v - b[i] sum += diff * diff } return sum } // EuclideanDist calculates the Euclidean distance between // two vectors represented as slices of float64. // // `a` and `b` are slices of float64 where each element of `a` // is paired with the corresponding element of `b`. // Returns the squared sum of the differences. func EuclideanDist[T float64 | int](a, b []T) float64 { return math.Sqrt(float64(SquareDist(a, b))) } // DefaultRG creates and returns a new instance of *rand.Rand. // // No parameters. // Returns *rand.Rand which is a pointer to a new random number // generator, seeded with the current time in nanoseconds. func DefaultRG() *rand.Rand { return rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) } type KmeansClustering struct { data *obiutils.Matrix[float64] // data matrix dimensions: n x p distmin []float64 // distance to closest center dimension: n classes []int // class of each data point dimension: n rg *rand.Rand // random number generator centers obiutils.Matrix[float64] // centers coordinates dimensions: k x p icenters []int // indices of centers dimension: k sizes []int // number of elements in each cluster dimension: k } // MakeKmeansClustering initializes a KmeansClustering with the // provided matrix data, number of clusters k, and random number // generator rg. // // data is a pointer to a Matrix of float64 representing the dataset, // k is the number of desired clusters, and rg is a pointer to a // random number generator used in the clustering process. // Returns a pointer to the initialized KmeansClustering structure. func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering { distmin := make([]float64, len(*data)) classes := make([]int, len(*data)) for i := 0; i < len(distmin); i++ { distmin[i] = math.MaxFloat64 classes[i] = -1 } clustering := &KmeansClustering{ data: data, distmin: distmin, classes: classes, rg: rg, centers: make(obiutils.Matrix[float64], 0, k), icenters: make([]int, 0, k), sizes: make([]int, 0, k), } for i := 0; i < k; i++ { clustering.AddACenter() } return clustering } // K returns the number of clusters in the K-means clustering algorithm. // // No parameters. // Returns an integer. func (clustering *KmeansClustering) K() int { return len(clustering.icenters) } // N returns the size of the dataset in the KmeansClustering instance. // // It does not take any parameters. // The return type is an integer. func (clustering *KmeansClustering) N() int { return len(*clustering.data) } // Dimension returns the dimension of the KmeansClustering data. // // No parameters. // Returns an integer representing the dimension of the data. func (clustering *KmeansClustering) Dimension() int { return len((*clustering.data)[0]) } // SetCenterTo sets the center of a specific cluster to a given data point index. // // Parameters: // - k: the index of the cluster, if k=-1, a new center is added // - i: the index of the data point // - reset: a boolean indicating whether to reset the distances to the nearest center // for points previously assigned to this center // // No return value. func (clustering *KmeansClustering) SetCenterTo(k, i int, reset bool) { N := clustering.N() K := clustering.K() center := (*clustering.data)[i] if k >= 0 { clustering.icenters[k] = i clustering.sizes[k] = 0 clustering.centers[k] = center if reset { // Recompute distances to the nearest center for points // previously assigned to this center K := clustering.K() for j := 0; j < N; j++ { if clustering.classes[j] == k { clustering.distmin[j] = math.MaxFloat64 for l := 1; l < K; l++ { dist := EuclideanDist((*clustering.data)[j], clustering.centers[l]) if dist < clustering.distmin[j] { clustering.distmin[j] = dist clustering.classes[j] = l } } clustering.sizes[clustering.classes[j]]++ } } } } else { clustering.icenters = append(clustering.icenters, i) clustering.sizes = append(clustering.sizes, 0) clustering.centers = append(clustering.centers, center) k = K K++ } for j := 0; j < clustering.N(); j++ { dist := EuclideanDist((*clustering.data)[j], center) if dist < clustering.distmin[j] { if C := clustering.classes[j]; C >= 0 { clustering.sizes[C]-- } clustering.distmin[j] = dist clustering.classes[j] = k clustering.sizes[k]++ } } } // AddACenter adds a new center to the KmeansClustering. // // If there are no centers, it randomly selects a new center. // If there are existing centers, it selects a new center with // probability proportional to its distance from the nearest // center. The center is then added to the clustering. func (clustering *KmeansClustering) AddACenter() { k := clustering.K() C := 0 if k == 0 { // if there are no centers yet, draw a sample as the first center C = rand.Intn(clustering.N()) } else { // otherwise, draw a sample with a probability proportional // to its closest distance to a center w := sampleuv.NewWeighted(clustering.distmin, clustering.rg) C, _ = w.Take() } clustering.SetCenterTo(-1, C, false) } // ResetEmptyCenters reinitializes any centers in a KmeansClustering // that have no assigned points. // // This method iterates over the centers and uses a weighted sampling // to reset centers with a size of zero. // Returns the number of centers that were reset. func (clustering *KmeansClustering) ResetEmptyCenters() int { nreset := 0 for i := 0; i < clustering.K(); i++ { if clustering.sizes[i] == 0 { w := sampleuv.NewWeighted(clustering.distmin, clustering.rg) C, _ := w.Take() clustering.SetCenterTo(i, C, false) nreset++ } } return nreset } // ClosestPoint finds the index of the closest point in the // clustering to the given coordinates. // // coordinates is a slice of float64 representing the point. // Returns the index of the closest point as an int. func (clustering *KmeansClustering) ClosestPoint(coordinates []float64) int { N := clustering.N() distmin := math.MaxFloat64 C := -1 for i := 0; i < N; i++ { dist := EuclideanDist((*clustering.data)[i], coordinates) if dist < distmin { distmin = dist C = i } } return C } // AssignToClass assigns each data point in the dataset to the nearest // center (class) in a K-means clustering algorithm. // // Handles the reinitialization of empty centers after the assignment. // No return values. func (clustering *KmeansClustering) AssignToClass() { var wg sync.WaitGroup var lock sync.Mutex // initialize the number of points in each class for i := 0; i < clustering.K(); i++ { clustering.sizes[i] = 0 } goroutine := func(i int) { defer wg.Done() dmin := math.MaxFloat64 cmin := -1 for j, center := range clustering.centers { dist := EuclideanDist((*clustering.data)[i], center) if dist < dmin { dmin = dist cmin = j } } clustering.classes[i] = cmin clustering.distmin[i] = dmin lock.Lock() clustering.sizes[cmin]++ lock.Unlock() } wg.Add(clustering.N()) for i := 0; i < clustering.N(); i++ { go goroutine(i) } wg.Wait() nreset := clustering.ResetEmptyCenters() if nreset > 0 { log.Warnf("Reseted %d empty centers", nreset) } } // SetCentersTo assigns new centers in the KmeansClustering // structure given a slice of indices. // // The indices parameter is a slice of integers that // corresponds to the new indices of the cluster centers in // the dataset. It panics if any index is out of bounds. // This method does not return any value. func (clustering *KmeansClustering) SetCentersTo(indices []int) { for _, v := range indices { if v < 0 || v >= clustering.N() { log.Fatalf("Invalid center index: %d", v) } } clustering.icenters = indices K := len(indices) for i := 0; i < K; i++ { clustering.centers[i] = (*clustering.data)[indices[i]] } clustering.AssignToClass() } // ComputeCenters calculates the centers of the K-means clustering algorithm. // // This method call AssignToClass() after computing the centers to ensure coherence // of the clustering data structure. // // It takes no parameters. // It does not return any values. func (clustering *KmeansClustering) ComputeCenters() { var wg sync.WaitGroup centers := clustering.centers data := clustering.data classes := clustering.classes K := clustering.K() // compute the location of center of class centerIdx // as the point in the data the closest to the // center of class centerIdx newCenter := func(centerIdx int) { defer wg.Done() center := make([]float64, clustering.Dimension()) for j := range center { center[j] = 0 } for j, row := range *data { if classes[j] == centerIdx { for l, val := range row { center[l] += val } } } for j := range centers[centerIdx] { center[j] /= float64(clustering.sizes[centerIdx]) } C := clustering.ClosestPoint(center) centers[centerIdx] = (*data)[C] clustering.icenters[centerIdx] = C } for i := 0; i < K; i++ { wg.Add(1) go newCenter(i) } wg.Wait() clustering.AssignToClass() } func (clustering *KmeansClustering) Inertia() float64 { inertia := 0.0 for i := 0; i < clustering.N(); i++ { inertia += clustering.distmin[i] * clustering.distmin[i] } return inertia } func (clustering *KmeansClustering) Centers() obiutils.Matrix[float64] { return clustering.centers } func (clustering *KmeansClustering) CentersIndices() []int { return clustering.icenters } func (clustering *KmeansClustering) Sizes() []int { return clustering.sizes } func (clustering *KmeansClustering) Classes() []int { return clustering.classes } func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool { prev := math.MaxFloat64 newI := clustering.Inertia() for i := 0; i < max_cycle && (prev-newI) > threshold; i++ { prev = newI clustering.ComputeCenters() newI = clustering.Inertia() } return (prev - newI) <= threshold } // // Kmeans performs the K-means clustering algorithm on the given data. // // if centers and *center is not nil, centers is considered as initialized // // and the number of classes (k) is set to the number of rows in centers. // // overwise, the number of classes is defined by the value of k. // // Parameters: // // - data: A pointer to a Matrix[float64] that represents the input data. // // - k: An integer that specifies the number of clusters to create. // // - threshold: A float64 value that determines the convergence threshold. // // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers. // // Returns: // // - classes: A slice of integers that assigns each data point to a cluster. // // - centers: A pointer to a Matrix[float64] that contains the final cluster centers. // // - inertia: A float64 value that represents the overall inertia of the clustering. // // - converged: A boolean value indicating whether the algorithm converged. // func Kmeans(data *obiutils.Matrix[float64], // k int, // threshold float64, // centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) { // if centers == nil || *centers == nil { // *centers = obiutils.Make2DArray[float64](k, len((*data)[0])) // center_ids := SampleIntWithoutReplacement(k, len(*data)) // for i, id := range center_ids { // (*centers)[i] = (*data)[id] // } // } else { // k = len(*centers) // } // classes := AssignToClass(data, centers) // centers = ComputeCenters(data, k, classes) // inertia := ComputeInertia(data, classes, centers) // delta := threshold * 100.0 // for i := 0; i < 100 && delta > threshold; i++ { // classes = AssignToClass(data, centers) // centers = ComputeCenters(data, k, classes) // newi := ComputeInertia(data, classes, centers) // delta = inertia - newi // inertia = newi // log.Debugf("Inertia: %f, delta: %f", inertia, delta) // } // return classes, centers, inertia, delta < threshold // } // // KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel. // // // // It takes a matrix of data points and a matrix of centers as input. // // The best representative is the data point that is closest to the center of the cluster. // // Returns an array of integers containing the index of the best representative for each cluster. // func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int { // bestRepresentative := make([]int, len(*centers)) // var wg sync.WaitGroup // wg.Add(len(*centers)) // for j, center := range *centers { // go func(j int, center []float64) { // defer wg.Done() // bestDistToCenter := math.MaxFloat64 // best := -1 // for i, row := range *data { // dist := 0.0 // for d, val := range row { // diff := val - center[d] // dist += diff * diff // } // if dist < bestDistToCenter { // bestDistToCenter = dist // best = i // } // } // if best == -1 { // log.Fatalf("No representative found for cluster %d", j) // } // bestRepresentative[j] = best // }(j, center) // } // wg.Wait() // return bestRepresentative // }