Files
obitools4/pkg/obistats/kmeans.go
Eric Coissac 0f8066d367 many changes ;-)
Former-commit-id: cb4aea844e960e4af4dc673ebc8eec49a7d12b13
2023-12-05 15:28:29 +01:00

364 lines
9.3 KiB
Go

package obistats
import (
"math"
"sync"
"time"
"golang.org/x/exp/rand"
"gonum.org/v1/gonum/stat/sampleuv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// squareDist returns the squared Euclidean distance between a and b.
//
// Coordinates are paired by position over the length of a, so b must hold
// at least len(a) values; extra values in b are ignored.
func squareDist(a, b []float64) float64 {
	total := 0.0
	for i, x := range a {
		delta := x - b[i]
		total += delta * delta
	}
	return total
}
// DefaultRG returns a new random generator whose source is seeded with the
// current time expressed in nanoseconds.
func DefaultRG() *rand.Rand {
	seed := uint64(time.Now().UnixNano())
	source := rand.NewSource(seed)
	return rand.New(source)
}
// KmeansClustering holds the state of a k-means clustering computed over the
// rows of a data matrix.
type KmeansClustering struct {
// data points to the matrix being clustered; each row is one data point.
data *obiutils.Matrix[float64]
// rg is the random generator handed to the weighted sampler when centers
// are drawn.
rg *rand.Rand
// centers holds one row per cluster; the rows are taken from data, so they
// share storage with the corresponding data rows.
centers obiutils.Matrix[float64]
// icenters holds, for each cluster, the index in data of the row used as
// its center.
icenters []int
// sizes holds, for each cluster, the number of data points assigned to it.
sizes []int
// distmin holds, for each data point, the squared distance to its nearest
// center.
distmin []float64
// classes holds, for each data point, the index of the cluster it is
// assigned to.
classes []int
}
// MakeKmeansClustering builds a KmeansClustering over data and seeds it with
// k initial centers chosen by successive calls to AddACenter.
//
// Parameters:
//   - data: the matrix to cluster, one data point per row.
//   - k: the number of centers to draw; a negative value is treated as 0.
//   - rg: the random generator used to draw the centers. When nil, a
//     time-seeded generator from DefaultRG is used.
//
// Returns the initialized clustering. No point is assigned to a class yet:
// classes are all zero until AssignToClass is called.
func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
	if rg == nil {
		// A nil *rand.Rand stored in the rand.Source interface expected by the
		// weighted sampler is a non-nil interface wrapping a nil pointer; it
		// would panic on the first draw.
		rg = DefaultRG()
	}
	if k < 0 {
		k = 0
	}

	n := len(*data)

	// Every point starts infinitely far from any center; AddACenter lowers
	// these values as centers are added.
	distmin := make([]float64, n)
	for i := range distmin {
		distmin[i] = math.MaxFloat64
	}

	clustering := &KmeansClustering{
		data:     data,
		icenters: make([]int, 0, k),
		sizes:    make([]int, 0, k),
		centers:  make(obiutils.Matrix[float64], 0, k),
		distmin:  distmin,
		classes:  make([]int, n),
		rg:       rg,
	}

	for i := 0; i < k; i++ {
		clustering.AddACenter()
	}

	return clustering
}
// K reports how many cluster centers the clustering currently holds.
//
// It takes no parameters and returns that count as an integer.
func (clustering *KmeansClustering) K() int {
	centerIndices := clustering.icenters
	return len(centerIndices)
}
// N reports the number of data points (rows of the data matrix) handled by
// the clustering.
//
// It takes no parameters and returns that count as an integer.
func (clustering *KmeansClustering) N() int {
	points := *clustering.data
	return len(points)
}
// Dimension returns the dimension of the KmeansClustering data, taken as the
// length of the first row of the data matrix.
//
// No parameters.
// Returns an integer representing the dimension of the data, or 0 when the
// clustering has no data matrix or the matrix has no row.
func (clustering *KmeansClustering) Dimension() int {
	if clustering.data == nil || len(*clustering.data) == 0 {
		// There is no first row to measure.
		return 0
	}
	return len((*clustering.data)[0])
}
// AddACenter selects one more data point as a cluster center and records it.
//
// The first center is drawn uniformly among the data points. Every later
// center is drawn with a probability proportional to distmin, the squared
// distance between a point and its nearest existing center. Once the center
// is recorded, distmin is lowered for every point that is closer to the new
// center than to the previous ones.
//
// Nothing is added when the data set is empty, or when the weighted draw
// cannot return a point (every remaining weight is zero).
func (clustering *KmeansClustering) AddACenter() {
	n := clustering.N()
	if n == 0 {
		return
	}

	C := 0
	if clustering.K() == 0 {
		// Draw from the clustering's own generator so that a seeded generator
		// gives reproducible centers; fall back to the package-level generator
		// only when none was provided.
		if clustering.rg != nil {
			C = clustering.rg.Intn(n)
		} else {
			C = rand.Intn(n)
		}
	} else {
		w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
		var ok bool
		C, ok = w.Take()
		if !ok {
			// Take reports failure with index -1, which must not be used to
			// index the data.
			log.Warn("Cannot add a center: no data point has a non-zero distance to the existing centers")
			return
		}
	}

	clustering.icenters = append(clustering.icenters, C)
	clustering.sizes = append(clustering.sizes, 0)

	// The center row shares its storage with the data row it was taken from.
	center := (*clustering.data)[C]
	clustering.centers = append(clustering.centers, center)

	for i := 0; i < n; i++ {
		d := squareDist((*clustering.data)[i], center)
		if d < clustering.distmin[i] {
			clustering.distmin[i] = d
		}
	}
}
// ResetEmptyCenters replaces the center of every cluster whose size is zero.
//
// It iterates over the clusters and, for each one with a size of zero, draws a
// data point with a probability proportional to distmin, using the rg
// generator. The drawn point becomes the new center: icenters and centers are
// updated, sizes is left untouched.
//
// A single weighted sampler is shared by all the replacements of one call.
// Because Take zeroes the weight of the point it returns, two empty clusters
// cannot receive the same data point. The scan stops early when the sampler
// has no point left to return.
//
// The function returns the number of centers that were actually reset.
func (clustering *KmeansClustering) ResetEmptyCenters() int {
	nreset := 0

	var sampler sampleuv.Weighted
	samplerReady := false

	for i := 0; i < clustering.K(); i++ {
		if clustering.sizes[i] != 0 {
			continue
		}

		// Build the sampler only when an empty cluster is actually found.
		if !samplerReady {
			sampler = sampleuv.NewWeighted(clustering.distmin, clustering.rg)
			samplerReady = true
		}

		C, ok := sampler.Take()
		if !ok {
			// Take reports failure with index -1; no further draw can succeed.
			break
		}

		clustering.icenters[i] = C
		clustering.centers[i] = (*clustering.data)[C]
		nreset++
	}

	return nreset
}
// AssignToClass assigns each data point to a class based on the distance to the nearest center.
//
// For every data point it records the index of the nearest center in classes
// and the corresponding squared distance in distmin, then recounts sizes.
// One goroutine is started per data point; the function waits for all of them
// before using their results.
//
// When some classes end up empty, their centers are replaced through
// ResetEmptyCenters and the whole assignment is run again.
//
// This function does not take any parameters.
// It does not return anything.
func (clustering *KmeansClustering) AssignToClass() {
	var wg sync.WaitGroup

	n := clustering.N()
	k := clustering.K()

	for i := 0; i < k; i++ {
		clustering.sizes[i] = 0
	}

	for i := 0; i < n; i++ {
		clustering.distmin[i] = math.MaxFloat64
	}

	// Each goroutine writes only the entries of classes and distmin that
	// belong to its own data point, so no lock is needed.
	assign := func(i int) {
		defer wg.Done()
		point := (*clustering.data)[i]
		dmin := math.MaxFloat64
		cmin := -1
		for j, center := range clustering.centers {
			dist := squareDist(point, center)
			if dist < dmin {
				dmin = dist
				cmin = j
			}
		}
		clustering.classes[i] = cmin
		clustering.distmin[i] = dmin
	}

	wg.Add(n)
	for i := 0; i < n; i++ {
		go assign(i)
	}

	// The assignments must be complete before sizes are counted and before
	// ResetEmptyCenters reads sizes and distmin or rewrites centers.
	wg.Wait()

	for i := 0; i < n; i++ {
		// A class of -1 means no center was comparable to the point.
		if c := clustering.classes[i]; c >= 0 {
			clustering.sizes[c]++
		}
	}

	nreset := clustering.ResetEmptyCenters()

	if nreset > 0 {
		log.Warnf("Reset %d empty centers", nreset)
		clustering.AssignToClass()
	}
}
// ComputeCenters calculates the centers of the K-means clustering algorithm.
//
// For each class it computes the mean of the data points assigned to the
// class, then selects as the new center the member of the class that is the
// closest to that mean. icenters receives the index of the selected point and
// centers receives the corresponding data row. One goroutine is started per
// class.
//
// A class without any member keeps its current center.
//
// It takes no parameters.
// It does not return any values.
func (clustering *KmeansClustering) ComputeCenters() {
	var wg sync.WaitGroup

	data := clustering.data
	classes := clustering.classes
	k := clustering.K()

	dim := 0
	if len(*data) > 0 {
		dim = len((*data)[0])
	}

	// Each goroutine writes only the icenters and centers entries of its own
	// class and reads only data and classes, which are not modified here.
	update := func(centerIdx int) {
		defer wg.Done()

		// The mean is accumulated in a scratch buffer, never in
		// clustering.centers: center rows share their storage with rows of
		// data, so accumulating in place would overwrite the data set.
		mean := make([]float64, dim)
		members := 0
		for j, row := range *data {
			if classes[j] != centerIdx {
				continue
			}
			for l, val := range row {
				mean[l] += val
			}
			members++
		}

		if members == 0 {
			// Nothing to average and no member to promote as center.
			return
		}

		for l := range mean {
			mean[l] /= float64(members)
		}

		dkmin := math.MaxFloat64
		dki := -1
		for j, row := range *data {
			if classes[j] != centerIdx {
				continue
			}
			dist := squareDist(row, mean)
			if dist < dkmin {
				dkmin = dist
				dki = j
			}
		}

		if dki < 0 {
			// No member had a comparable distance (e.g. NaN coordinates).
			return
		}

		clustering.icenters[centerIdx] = dki
		clustering.centers[centerIdx] = (*data)[dki]
	}

	wg.Add(k)
	for i := 0; i < k; i++ {
		go update(i)
	}

	wg.Wait()
}
// Inertia returns the sum, over all the data points, of the squared distance
// currently recorded in distmin between each point and its nearest center.
func (clustering *KmeansClustering) Inertia() float64 {
	total := 0.0
	for _, d := range clustering.distmin[:clustering.N()] {
		total += d
	}
	return total
}
// Centers returns the matrix of the current cluster centers, one row per
// cluster. The returned matrix is the clustering's own storage, not a copy.
func (clustering *KmeansClustering) Centers() obiutils.Matrix[float64] {
	current := clustering.centers
	return current
}
// CentersIndices returns, for each cluster, the index in the data matrix of
// the row used as its center. The returned slice is the clustering's own
// storage, not a copy.
func (clustering *KmeansClustering) CentersIndices() []int {
	indices := clustering.icenters
	return indices
}
// Sizes returns, for each cluster, the number of data points assigned to it.
// The returned slice is the clustering's own storage, not a copy.
func (clustering *KmeansClustering) Sizes() []int {
	counts := clustering.sizes
	return counts
}
// Classes returns, for each data point, the index of the cluster it is
// assigned to. The returned slice is the clustering's own storage, not a copy.
func (clustering *KmeansClustering) Classes() []int {
	assignment := clustering.classes
	return assignment
}
// Run iterates the k-means algorithm: each cycle assigns the data points to
// their nearest center, recomputes the centers, and measures the inertia.
//
// Parameters:
//   - max_cycle: the maximum number of cycles to run.
//   - threshold: iteration stops as soon as the inertia decrease between two
//     consecutive cycles is not greater than this value.
//
// Returns true when the last measured inertia decrease is lower than or equal
// to threshold, false when iteration stopped because max_cycle was reached
// first.
func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool {
	previous := math.MaxFloat64
	current := clustering.Inertia()

	cycle := 0
	for cycle < max_cycle && previous-current > threshold {
		previous = current
		clustering.AssignToClass()
		clustering.ComputeCenters()
		current = clustering.Inertia()
		cycle++
	}

	return previous-current <= threshold
}
// // Kmeans performs the K-means clustering algorithm on the given data.
// // If centers and *centers are not nil, centers is considered as initialized
// // and the number of classes (k) is set to the number of rows in centers.
// // Otherwise, the number of classes is defined by the value of k.
// // Parameters:
// // - data: A pointer to a Matrix[float64] that represents the input data.
// // - k: An integer that specifies the number of clusters to create.
// // - threshold: A float64 value that determines the convergence threshold.
// // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
// // Returns:
// // - classes: A slice of integers that assigns each data point to a cluster.
// // - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
// // - inertia: A float64 value that represents the overall inertia of the clustering.
// // - converged: A boolean value indicating whether the algorithm converged.
// func Kmeans(data *obiutils.Matrix[float64],
// k int,
// threshold float64,
// centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
// if centers == nil || *centers == nil {
// *centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
// center_ids := SampleIntWithoutReplacement(k, len(*data))
// for i, id := range center_ids {
// (*centers)[i] = (*data)[id]
// }
// } else {
// k = len(*centers)
// }
// classes := AssignToClass(data, centers)
// centers = ComputeCenters(data, k, classes)
// inertia := ComputeInertia(data, classes, centers)
// delta := threshold * 100.0
// for i := 0; i < 100 && delta > threshold; i++ {
// classes = AssignToClass(data, centers)
// centers = ComputeCenters(data, k, classes)
// newi := ComputeInertia(data, classes, centers)
// delta = inertia - newi
// inertia = newi
// log.Debugf("Inertia: %f, delta: %f", inertia, delta)
// }
// return classes, centers, inertia, delta < threshold
// }
// // KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
// //
// // It takes a matrix of data points and a matrix of centers as input.
// // The best representative is the data point that is closest to the center of the cluster.
// // Returns an array of integers containing the index of the best representative for each cluster.
// func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
// bestRepresentative := make([]int, len(*centers))
// var wg sync.WaitGroup
// wg.Add(len(*centers))
// for j, center := range *centers {
// go func(j int, center []float64) {
// defer wg.Done()
// bestDistToCenter := math.MaxFloat64
// best := -1
// for i, row := range *data {
// dist := 0.0
// for d, val := range row {
// diff := val - center[d]
// dist += diff * diff
// }
// if dist < bestDistToCenter {
// bestDistToCenter = dist
// best = i
// }
// }
// if best == -1 {
// log.Fatalf("No representative found for cluster %d", j)
// }
// bestRepresentative[j] = best
// }(j, center)
// }
// wg.Wait()
// return bestRepresentative
// }