mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
many changes ;-)
Former-commit-id: cb4aea844e960e4af4dc673ebc8eec49a7d12b13
This commit is contained in:
@ -3,93 +3,190 @@ package obistats
|
||||
import (
|
||||
"math"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/exp/rand"
|
||||
"gonum.org/v1/gonum/stat/sampleuv"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// AssignToClass applies the nearest neighbor algorithm to assign data points to classes.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: a 2D slice of float64 representing the data points to be assigned.
|
||||
// - centers: a 2D slice of float64 representing the center points for each class.
|
||||
//
|
||||
// Return:
|
||||
// - classes: a slice of int representing the assigned class for each data point.
|
||||
func AssignToClass(data, centers *obiutils.Matrix[float64]) []int {
|
||||
classes := make([]int, len(*data))
|
||||
numData := len(*data)
|
||||
numCenters := len(*centers)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(numData)
|
||||
|
||||
for i := 0; i < numData; i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
minDist := math.MaxFloat64
|
||||
minDistIndex := -1
|
||||
rowData := (*data)[i]
|
||||
|
||||
for j := 0; j < numCenters; j++ {
|
||||
centerData := (*centers)[j]
|
||||
dist := 0.0
|
||||
|
||||
for d, val := range rowData {
|
||||
diff := val - centerData[d]
|
||||
dist += diff * diff
|
||||
}
|
||||
|
||||
if dist < minDist {
|
||||
minDist = dist
|
||||
minDistIndex = j
|
||||
}
|
||||
}
|
||||
|
||||
classes[i] = minDistIndex
|
||||
}(i)
|
||||
func squareDist(a, b []float64) float64 {
|
||||
sum := 0.0
|
||||
for i := 0; i < len(a); i++ {
|
||||
diff := a[i] - b[i]
|
||||
sum += diff * diff
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
return classes
|
||||
return sum
|
||||
}
|
||||
|
||||
// ComputeCenters calculates the centers of clusters for a given data set.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: a pointer to a matrix of float64 values representing the data set.
|
||||
// - k: an integer representing the number of clusters.
|
||||
// - classes: a slice of integers representing the assigned cluster for each data point.
|
||||
//
|
||||
// Returns:
|
||||
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
|
||||
// ComputeCenters calculates the centers of clusters for a given data set.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: a pointer to a matrix of float64 values representing the data set.
|
||||
// - k: an integer representing the number of clusters.
|
||||
// - classes: a slice of integers representing the assigned cluster for each data point.
|
||||
//
|
||||
// Returns:
|
||||
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
|
||||
func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiutils.Matrix[float64] {
|
||||
centers := obiutils.Make2DNumericArray[float64](k, len((*data)[0]), true)
|
||||
ns := make([]int, k)
|
||||
func DefaultRG() *rand.Rand {
|
||||
return rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
type KmeansClustering struct {
|
||||
data *obiutils.Matrix[float64]
|
||||
rg *rand.Rand
|
||||
centers obiutils.Matrix[float64]
|
||||
icenters []int
|
||||
sizes []int
|
||||
distmin []float64
|
||||
classes []int
|
||||
}
|
||||
|
||||
for i := range ns {
|
||||
ns[i] = 0
|
||||
func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
|
||||
distmin := make([]float64, len(*data))
|
||||
for i := 0; i < len(distmin); i++ {
|
||||
distmin[i] = math.MaxFloat64
|
||||
}
|
||||
|
||||
clustering := &KmeansClustering{
|
||||
data: data,
|
||||
icenters: make([]int, 0, k),
|
||||
sizes: make([]int, 0, k),
|
||||
centers: make(obiutils.Matrix[float64], 0, k),
|
||||
distmin: distmin,
|
||||
classes: make([]int, len(*data)),
|
||||
rg: rg,
|
||||
}
|
||||
|
||||
for i := 0; i < k; i++ {
|
||||
clustering.AddACenter()
|
||||
}
|
||||
|
||||
return clustering
|
||||
}
|
||||
|
||||
// K returns the number of clusters in the K-means clustering algorithm.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an integer.
|
||||
func (clustering *KmeansClustering) K() int {
|
||||
return len(clustering.icenters)
|
||||
}
|
||||
|
||||
// N returns the size of the dataset in the KmeansClustering instance.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// The return type is an integer.
|
||||
func (clustering *KmeansClustering) N() int {
|
||||
return len(*clustering.data)
|
||||
}
|
||||
|
||||
// Dimension returns the dimension of the KmeansClustering data.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an integer representing the dimension of the data.
|
||||
func (clustering *KmeansClustering) Dimension() int {
|
||||
return len((*clustering.data)[0])
|
||||
}
|
||||
func (clustering *KmeansClustering) AddACenter() {
|
||||
C := 0
|
||||
if clustering.K() == 0 {
|
||||
C = rand.Intn(clustering.N())
|
||||
} else {
|
||||
w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
|
||||
C, _ = w.Take()
|
||||
}
|
||||
clustering.icenters = append(clustering.icenters, C)
|
||||
clustering.sizes = append(clustering.sizes, 0)
|
||||
center := (*clustering.data)[C]
|
||||
clustering.centers = append(clustering.centers, center)
|
||||
|
||||
n := clustering.N()
|
||||
|
||||
for i := 0; i < n; i++ {
|
||||
d := squareDist((*clustering.data)[i], center)
|
||||
if d < clustering.distmin[i] {
|
||||
clustering.distmin[i] = d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ResetEmptyCenters resets the empty centers in the KmeansClustering struct.
|
||||
//
|
||||
// It iterates over the centers and checks if their corresponding sizes are zero.
|
||||
// If a center is empty, a new weighted sample is taken with the help of the distmin and rg variables.
|
||||
// The new center is then assigned to the empty center index, and the sizes and centers arrays are updated accordingly.
|
||||
// Finally, the function returns the number of empty centers that were reset.
|
||||
func (clustering *KmeansClustering) ResetEmptyCenters() int {
|
||||
nreset := 0
|
||||
for i := 0; i < clustering.K(); i++ {
|
||||
if clustering.sizes[i] == 0 {
|
||||
w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
|
||||
C, _ := w.Take()
|
||||
clustering.icenters[i] = C
|
||||
clustering.centers[i] = (*clustering.data)[C]
|
||||
nreset++
|
||||
}
|
||||
}
|
||||
return nreset
|
||||
}
|
||||
|
||||
// AssignToClass assigns each data point to a class based on the distance to the nearest center.
|
||||
//
|
||||
// This function does not take any parameters.
|
||||
// It does not return anything.
|
||||
func (clustering *KmeansClustering) AssignToClass() {
|
||||
var wg sync.WaitGroup
|
||||
var lock sync.Mutex
|
||||
|
||||
for i := 0; i < clustering.K(); i++ {
|
||||
clustering.sizes[i] = 0
|
||||
}
|
||||
for i := 0; i < clustering.N(); i++ {
|
||||
clustering.distmin[i] = math.MaxFloat64
|
||||
}
|
||||
|
||||
goroutine := func(i int) {
|
||||
defer wg.Done()
|
||||
dmin := math.MaxFloat64
|
||||
cmin := -1
|
||||
for j, center := range clustering.centers {
|
||||
dist := squareDist((*clustering.data)[i], center)
|
||||
if dist < dmin {
|
||||
dmin = dist
|
||||
cmin = j
|
||||
}
|
||||
}
|
||||
lock.Lock()
|
||||
clustering.classes[i] = cmin
|
||||
clustering.sizes[cmin]++
|
||||
clustering.distmin[i] = dmin
|
||||
lock.Unlock()
|
||||
}
|
||||
|
||||
wg.Add(clustering.N())
|
||||
for i := 0; i < clustering.N(); i++ {
|
||||
go goroutine(i)
|
||||
}
|
||||
|
||||
nreset := clustering.ResetEmptyCenters()
|
||||
|
||||
if nreset > 0 {
|
||||
log.Warnf("Reset %d empty centers", nreset)
|
||||
clustering.AssignToClass()
|
||||
}
|
||||
}
|
||||
|
||||
// ComputeCenters calculates the centers of the K-means clustering algorithm.
|
||||
//
|
||||
// It takes no parameters.
|
||||
// It does not return any values.
|
||||
func (clustering *KmeansClustering) ComputeCenters() {
|
||||
var wg sync.WaitGroup
|
||||
centers := clustering.centers
|
||||
data := clustering.data
|
||||
classes := clustering.classes
|
||||
k := clustering.K()
|
||||
|
||||
// Goroutine code
|
||||
goroutine := func(centerIdx int) {
|
||||
goroutine1 := func(centerIdx int) {
|
||||
defer wg.Done()
|
||||
for j, row := range *data {
|
||||
class := classes[j]
|
||||
if class == centerIdx {
|
||||
ns[centerIdx]++
|
||||
for l, val := range row {
|
||||
centers[centerIdx][l] += val
|
||||
}
|
||||
@ -99,149 +196,168 @@ func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiut
|
||||
|
||||
for i := 0; i < k; i++ {
|
||||
wg.Add(1)
|
||||
go goroutine(i)
|
||||
go goroutine1(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
for i := range centers {
|
||||
for j := range centers[i] {
|
||||
centers[i][j] /= float64(ns[i])
|
||||
centers[i][j] /= float64(clustering.sizes[i])
|
||||
}
|
||||
}
|
||||
|
||||
return ¢ers
|
||||
}
|
||||
|
||||
// ComputeInertia computes the inertia of the given data and centers in parallel.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: A pointer to a Matrix of float64 representing the data.
|
||||
// - classes: A slice of int representing the class labels for each data point.
|
||||
// - centers: A pointer to a Matrix of float64 representing the centers.
|
||||
//
|
||||
// Return type:
|
||||
// - float64: The computed inertia.
|
||||
func ComputeInertia(data *obiutils.Matrix[float64], classes []int, centers *obiutils.Matrix[float64]) float64 {
|
||||
inertia := make(chan float64)
|
||||
numRows := len(*data)
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(numRows)
|
||||
|
||||
for i := 0; i < numRows; i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
row := (*data)[i]
|
||||
class := classes[i]
|
||||
center := (*centers)[class]
|
||||
inertiaLocal := 0.0
|
||||
for j, val := range row {
|
||||
diff := val - center[j]
|
||||
inertiaLocal += diff * diff
|
||||
goroutine2 := func(centerIdx int) {
|
||||
defer wg.Done()
|
||||
dkmin := math.MaxFloat64
|
||||
dki := -1
|
||||
center := centers[centerIdx]
|
||||
for j, row := range *data {
|
||||
if classes[j] == centerIdx {
|
||||
dist := squareDist(row, center)
|
||||
if dist < dkmin {
|
||||
dkmin = dist
|
||||
dki = j
|
||||
}
|
||||
}
|
||||
inertia <- inertiaLocal
|
||||
}(i)
|
||||
}
|
||||
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(inertia)
|
||||
}()
|
||||
|
||||
totalInertia := 0.0
|
||||
for localInertia := range inertia {
|
||||
totalInertia += localInertia
|
||||
}
|
||||
|
||||
return totalInertia
|
||||
}
|
||||
|
||||
// Kmeans performs the K-means clustering algorithm on the given data.
|
||||
//
|
||||
// If centers and *centers are both non-nil, centers is considered as already initialized
|
||||
// and the number of classes (k) is set to the number of rows in centers.
|
||||
// Otherwise, the number of classes is defined by the value of k.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: A pointer to a Matrix[float64] that represents the input data.
|
||||
// - k: An integer that specifies the number of clusters to create.
|
||||
// - threshold: A float64 value that determines the convergence threshold.
|
||||
// - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
|
||||
//
|
||||
// Returns:
|
||||
// - classes: A slice of integers that assigns each data point to a cluster.
|
||||
// - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
|
||||
// - inertia: A float64 value that represents the overall inertia of the clustering.
|
||||
// - converged: A boolean value indicating whether the algorithm converged.
|
||||
func Kmeans(data *obiutils.Matrix[float64],
|
||||
k int,
|
||||
threshold float64,
|
||||
centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
|
||||
if centers == nil || *centers == nil {
|
||||
*centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
|
||||
center_ids := SampleIntWithoutReplacement(k, len(*data))
|
||||
for i, id := range center_ids {
|
||||
(*centers)[i] = (*data)[id]
|
||||
}
|
||||
} else {
|
||||
k = len(*centers)
|
||||
clustering.icenters[centerIdx] = dki
|
||||
clustering.centers[centerIdx] = (*data)[dki]
|
||||
}
|
||||
|
||||
classes := AssignToClass(data, centers)
|
||||
centers = ComputeCenters(data, k, classes)
|
||||
inertia := ComputeInertia(data, classes, centers)
|
||||
delta := threshold * 100.0
|
||||
for i := 0; i < 100 && delta > threshold; i++ {
|
||||
classes = AssignToClass(data, centers)
|
||||
centers = ComputeCenters(data, k, classes)
|
||||
newi := ComputeInertia(data, classes, centers)
|
||||
delta = inertia - newi
|
||||
inertia = newi
|
||||
log.Debugf("Inertia: %f, delta: %f", inertia, delta)
|
||||
}
|
||||
|
||||
return classes, centers, inertia, delta < threshold
|
||||
}
|
||||
|
||||
// KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
|
||||
//
|
||||
// It takes a matrix of data points and a matrix of centers as input.
|
||||
// The best representative is the data point that is closest to the center of the cluster.
|
||||
// Returns an array of integers containing the index of the best representative for each cluster.
|
||||
func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
|
||||
bestRepresentative := make([]int, len(*centers))
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(*centers))
|
||||
|
||||
for j, center := range *centers {
|
||||
go func(j int, center []float64) {
|
||||
defer wg.Done()
|
||||
|
||||
bestDistToCenter := math.MaxFloat64
|
||||
best := -1
|
||||
|
||||
for i, row := range *data {
|
||||
dist := 0.0
|
||||
for d, val := range row {
|
||||
diff := val - center[d]
|
||||
dist += diff * diff
|
||||
}
|
||||
if dist < bestDistToCenter {
|
||||
bestDistToCenter = dist
|
||||
best = i
|
||||
}
|
||||
}
|
||||
|
||||
if best == -1 {
|
||||
log.Fatalf("No representative found for cluster %d", j)
|
||||
}
|
||||
|
||||
bestRepresentative[j] = best
|
||||
}(j, center)
|
||||
for i := 0; i < k; i++ {
|
||||
wg.Add(1)
|
||||
go goroutine2(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
return bestRepresentative
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Inertia() float64 {
|
||||
inertia := 0.0
|
||||
|
||||
for i := 0; i < clustering.N(); i++ {
|
||||
inertia += clustering.distmin[i]
|
||||
}
|
||||
return inertia
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Centers() obiutils.Matrix[float64] {
|
||||
return clustering.centers
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) CentersIndices() []int {
|
||||
return clustering.icenters
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Sizes() []int {
|
||||
return clustering.sizes
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Classes() []int {
|
||||
return clustering.classes
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool {
|
||||
prev := math.MaxFloat64
|
||||
newI := clustering.Inertia()
|
||||
for i := 0; i < max_cycle && (prev-newI) > threshold; i++ {
|
||||
prev = newI
|
||||
clustering.AssignToClass()
|
||||
clustering.ComputeCenters()
|
||||
newI = clustering.Inertia()
|
||||
}
|
||||
|
||||
return (prev - newI) <= threshold
|
||||
}
|
||||
|
||||
// // Kmeans performs the K-means clustering algorithm on the given data.
|
||||
|
||||
// // If centers and *centers are both non-nil, centers is considered as already initialized
|
||||
// // and the number of classes (k) is set to the number of rows in centers.
|
||||
// // Otherwise, the number of classes is defined by the value of k.
|
||||
|
||||
// // Parameters:
|
||||
// // - data: A pointer to a Matrix[float64] that represents the input data.
|
||||
// // - k: An integer that specifies the number of clusters to create.
|
||||
// // - threshold: A float64 value that determines the convergence threshold.
|
||||
// // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
|
||||
|
||||
// // Returns:
|
||||
// // - classes: A slice of integers that assigns each data point to a cluster.
|
||||
// // - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
|
||||
// // - inertia: A float64 value that represents the overall inertia of the clustering.
|
||||
// // - converged: A boolean value indicating whether the algorithm converged.
|
||||
// func Kmeans(data *obiutils.Matrix[float64],
|
||||
// k int,
|
||||
// threshold float64,
|
||||
// centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
|
||||
// if centers == nil || *centers == nil {
|
||||
// *centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
|
||||
// center_ids := SampleIntWithoutReplacement(k, len(*data))
|
||||
// for i, id := range center_ids {
|
||||
// (*centers)[i] = (*data)[id]
|
||||
// }
|
||||
// } else {
|
||||
// k = len(*centers)
|
||||
// }
|
||||
|
||||
// classes := AssignToClass(data, centers)
|
||||
// centers = ComputeCenters(data, k, classes)
|
||||
// inertia := ComputeInertia(data, classes, centers)
|
||||
// delta := threshold * 100.0
|
||||
// for i := 0; i < 100 && delta > threshold; i++ {
|
||||
// classes = AssignToClass(data, centers)
|
||||
// centers = ComputeCenters(data, k, classes)
|
||||
// newi := ComputeInertia(data, classes, centers)
|
||||
// delta = inertia - newi
|
||||
// inertia = newi
|
||||
// log.Debugf("Inertia: %f, delta: %f", inertia, delta)
|
||||
// }
|
||||
|
||||
// return classes, centers, inertia, delta < threshold
|
||||
// }
|
||||
|
||||
// // KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
|
||||
// //
|
||||
// // It takes a matrix of data points and a matrix of centers as input.
|
||||
// // The best representative is the data point that is closest to the center of the cluster.
|
||||
// // Returns an array of integers containing the index of the best representative for each cluster.
|
||||
// func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
|
||||
// bestRepresentative := make([]int, len(*centers))
|
||||
|
||||
// var wg sync.WaitGroup
|
||||
// wg.Add(len(*centers))
|
||||
|
||||
// for j, center := range *centers {
|
||||
// go func(j int, center []float64) {
|
||||
// defer wg.Done()
|
||||
|
||||
// bestDistToCenter := math.MaxFloat64
|
||||
// best := -1
|
||||
|
||||
// for i, row := range *data {
|
||||
// dist := 0.0
|
||||
// for d, val := range row {
|
||||
// diff := val - center[d]
|
||||
// dist += diff * diff
|
||||
// }
|
||||
// if dist < bestDistToCenter {
|
||||
// bestDistToCenter = dist
|
||||
// best = i
|
||||
// }
|
||||
// }
|
||||
|
||||
// if best == -1 {
|
||||
// log.Fatalf("No representative found for cluster %d", j)
|
||||
// }
|
||||
|
||||
// bestRepresentative[j] = best
|
||||
// }(j, center)
|
||||
// }
|
||||
|
||||
// wg.Wait()
|
||||
|
||||
// return bestRepresentative
|
||||
// }
|
||||
|
Reference in New Issue
Block a user