mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
248 lines
6.6 KiB
Go
248 lines
6.6 KiB
Go
package obistats
|
|
|
|
import (
|
|
"math"
|
|
"sync"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// AssignToClass applies the nearest neighbor algorithm to assign data points to classes.
|
|
//
|
|
// Parameters:
|
|
// - data: a 2D slice of float64 representing the data points to be assigned.
|
|
// - centers: a 2D slice of float64 representing the center points for each class.
|
|
//
|
|
// Return:
|
|
// - classes: a slice of int representing the assigned class for each data point.
|
|
func AssignToClass(data, centers *obiutils.Matrix[float64]) []int {
|
|
classes := make([]int, len(*data))
|
|
numData := len(*data)
|
|
numCenters := len(*centers)
|
|
|
|
var wg sync.WaitGroup
|
|
wg.Add(numData)
|
|
|
|
for i := 0; i < numData; i++ {
|
|
go func(i int) {
|
|
defer wg.Done()
|
|
minDist := math.MaxFloat64
|
|
minDistIndex := -1
|
|
rowData := (*data)[i]
|
|
|
|
for j := 0; j < numCenters; j++ {
|
|
centerData := (*centers)[j]
|
|
dist := 0.0
|
|
|
|
for d, val := range rowData {
|
|
diff := val - centerData[d]
|
|
dist += diff * diff
|
|
}
|
|
|
|
if dist < minDist {
|
|
minDist = dist
|
|
minDistIndex = j
|
|
}
|
|
}
|
|
|
|
classes[i] = minDistIndex
|
|
}(i)
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
return classes
|
|
}
|
|
|
|
// ComputeCenters calculates the centers of clusters for a given data set.
|
|
//
|
|
// Parameters:
|
|
// - data: a pointer to a matrix of float64 values representing the data set.
|
|
// - k: an integer representing the number of clusters.
|
|
// - classes: a slice of integers representing the assigned cluster for each data point.
|
|
//
|
|
// Returns:
|
|
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
|
|
// ComputeCenters calculates the centers of clusters for a given data set.
|
|
//
|
|
// Parameters:
|
|
// - data: a pointer to a matrix of float64 values representing the data set.
|
|
// - k: an integer representing the number of clusters.
|
|
// - classes: a slice of integers representing the assigned cluster for each data point.
|
|
//
|
|
// Returns:
|
|
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
|
|
func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiutils.Matrix[float64] {
|
|
centers := obiutils.Make2DNumericArray[float64](k, len((*data)[0]), true)
|
|
ns := make([]int, k)
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
for i := range ns {
|
|
ns[i] = 0
|
|
}
|
|
|
|
// Goroutine code
|
|
goroutine := func(centerIdx int) {
|
|
defer wg.Done()
|
|
for j, row := range *data {
|
|
class := classes[j]
|
|
if class == centerIdx {
|
|
ns[centerIdx]++
|
|
for l, val := range row {
|
|
centers[centerIdx][l] += val
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for i := 0; i < k; i++ {
|
|
wg.Add(1)
|
|
go goroutine(i)
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
for i := range centers {
|
|
for j := range centers[i] {
|
|
centers[i][j] /= float64(ns[i])
|
|
}
|
|
}
|
|
|
|
return ¢ers
|
|
}
|
|
|
|
// ComputeInertia computes the inertia of the given data and centers in parallel.
|
|
//
|
|
// Parameters:
|
|
// - data: A pointer to a Matrix of float64 representing the data.
|
|
// - classes: A slice of int representing the class labels for each data point.
|
|
// - centers: A pointer to a Matrix of float64 representing the centers.
|
|
//
|
|
// Return type:
|
|
// - float64: The computed inertia.
|
|
func ComputeInertia(data *obiutils.Matrix[float64], classes []int, centers *obiutils.Matrix[float64]) float64 {
|
|
inertia := make(chan float64)
|
|
numRows := len(*data)
|
|
wg := sync.WaitGroup{}
|
|
wg.Add(numRows)
|
|
|
|
for i := 0; i < numRows; i++ {
|
|
go func(i int) {
|
|
defer wg.Done()
|
|
row := (*data)[i]
|
|
class := classes[i]
|
|
center := (*centers)[class]
|
|
inertiaLocal := 0.0
|
|
for j, val := range row {
|
|
diff := val - center[j]
|
|
inertiaLocal += diff * diff
|
|
}
|
|
inertia <- inertiaLocal
|
|
}(i)
|
|
}
|
|
|
|
go func() {
|
|
wg.Wait()
|
|
close(inertia)
|
|
}()
|
|
|
|
totalInertia := 0.0
|
|
for localInertia := range inertia {
|
|
totalInertia += localInertia
|
|
}
|
|
|
|
return totalInertia
|
|
}
|
|
|
|
// Kmeans performs the K-means clustering algorithm on the given data.
|
|
//
|
|
// if centers and *center is not nil, centers is considered as initialized
|
|
// and the number of classes (k) is set to the number of rows in centers.
|
|
// overwise, the number of classes is defined by the value of k.
|
|
//
|
|
// Parameters:
|
|
// - data: A pointer to a Matrix[float64] that represents the input data.
|
|
// - k: An integer that specifies the number of clusters to create.
|
|
// - threshold: A float64 value that determines the convergence threshold.
|
|
// - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
|
|
//
|
|
// Returns:
|
|
// - classes: A slice of integers that assigns each data point to a cluster.
|
|
// - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
|
|
// - inertia: A float64 value that represents the overall inertia of the clustering.
|
|
// - converged: A boolean value indicating whether the algorithm converged.
|
|
func Kmeans(data *obiutils.Matrix[float64],
|
|
k int,
|
|
threshold float64,
|
|
centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
|
|
if centers == nil || *centers == nil {
|
|
*centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
|
|
center_ids := SampleIntWithoutReplacement(k, len(*data))
|
|
for i, id := range center_ids {
|
|
(*centers)[i] = (*data)[id]
|
|
}
|
|
} else {
|
|
k = len(*centers)
|
|
}
|
|
|
|
classes := AssignToClass(data, centers)
|
|
centers = ComputeCenters(data, k, classes)
|
|
inertia := ComputeInertia(data, classes, centers)
|
|
delta := threshold * 100.0
|
|
for i := 0; i < 100 && delta > threshold; i++ {
|
|
classes = AssignToClass(data, centers)
|
|
centers = ComputeCenters(data, k, classes)
|
|
newi := ComputeInertia(data, classes, centers)
|
|
delta = inertia - newi
|
|
inertia = newi
|
|
log.Debugf("Inertia: %f, delta: %f", inertia, delta)
|
|
}
|
|
|
|
return classes, centers, inertia, delta < threshold
|
|
}
|
|
|
|
// KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
|
|
//
|
|
// It takes a matrix of data points and a matrix of centers as input.
|
|
// The best representative is the data point that is closest to the center of the cluster.
|
|
// Returns an array of integers containing the index of the best representative for each cluster.
|
|
func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
|
|
bestRepresentative := make([]int, len(*centers))
|
|
|
|
var wg sync.WaitGroup
|
|
wg.Add(len(*centers))
|
|
|
|
for j, center := range *centers {
|
|
go func(j int, center []float64) {
|
|
defer wg.Done()
|
|
|
|
bestDistToCenter := math.MaxFloat64
|
|
best := -1
|
|
|
|
for i, row := range *data {
|
|
dist := 0.0
|
|
for d, val := range row {
|
|
diff := val - center[d]
|
|
dist += diff * diff
|
|
}
|
|
if dist < bestDistToCenter {
|
|
bestDistToCenter = dist
|
|
best = i
|
|
}
|
|
}
|
|
|
|
if best == -1 {
|
|
log.Fatalf("No representative found for cluster %d", j)
|
|
}
|
|
|
|
bestRepresentative[j] = best
|
|
}(j, center)
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
return bestRepresentative
|
|
}
|