mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
494 lines
14 KiB
Go
494 lines
14 KiB
Go
package obistats
|
|
|
|
import (
|
|
"math"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/exp/rand"
|
|
"gonum.org/v1/gonum/stat/sampleuv"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obilog"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// SquareDist calculates the squared Euclidean distance between
// two vectors 'a' and 'b'.
//
// 'a' and 'b' are slices of float64 or int values representing
// coordinate points in space. Both slices are assumed to have the
// same length.
// Returns the squared distance as a value of the same type T.
func SquareDist[T float64 | int](a, b []T) T {
	var total T
	for i := range a {
		d := a[i] - b[i]
		total += d * d
	}
	return total
}
|
|
|
|
// EuclideanDist calculates the Euclidean distance between
|
|
// two vectors represented as slices of float64.
|
|
//
|
|
// `a` and `b` are slices of float64 where each element of `a`
|
|
// is paired with the corresponding element of `b`.
|
|
// Returns the squared sum of the differences.
|
|
func EuclideanDist[T float64 | int](a, b []T) float64 {
|
|
return math.Sqrt(float64(SquareDist(a, b)))
|
|
}
|
|
|
|
// DefaultRG creates and returns a new instance of *rand.Rand.
|
|
//
|
|
// No parameters.
|
|
// Returns *rand.Rand which is a pointer to a new random number
|
|
// generator, seeded with the current time in nanoseconds.
|
|
func DefaultRG() *rand.Rand {
|
|
return rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
|
|
}
|
|
|
|
// KmeansClustering holds the full state of a k-means run over a fixed
// dataset of n points in p dimensions: the data matrix, the current k
// centers, and the per-point bookkeeping (assigned class and distance
// to the closest center).
type KmeansClustering struct {
	data     *obiutils.Matrix[float64] // data matrix dimensions: n x p
	distmin  []float64                 // distance to closest center dimension: n
	classes  []int                     // class of each data point dimension: n
	rg       *rand.Rand                // random number generator
	centers  obiutils.Matrix[float64]  // centers coordinates dimensions: k x p
	icenters []int                     // indices of centers dimension: k
	sizes    []int                     // number of elements in each cluster dimension: k
}
|
|
|
|
// MakeKmeansClustering initializes a KmeansClustering with the
|
|
// provided matrix data, number of clusters k, and random number
|
|
// generator rg.
|
|
//
|
|
// data is a pointer to a Matrix of float64 representing the dataset,
|
|
// k is the number of desired clusters, and rg is a pointer to a
|
|
// random number generator used in the clustering process.
|
|
// Returns a pointer to the initialized KmeansClustering structure.
|
|
func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
|
|
distmin := make([]float64, len(*data))
|
|
classes := make([]int, len(*data))
|
|
for i := 0; i < len(distmin); i++ {
|
|
distmin[i] = math.MaxFloat64
|
|
classes[i] = -1
|
|
}
|
|
|
|
clustering := &KmeansClustering{
|
|
data: data,
|
|
distmin: distmin,
|
|
classes: classes,
|
|
rg: rg,
|
|
centers: make(obiutils.Matrix[float64], 0, k),
|
|
icenters: make([]int, 0, k),
|
|
sizes: make([]int, 0, k),
|
|
}
|
|
|
|
for i := 0; i < k; i++ {
|
|
clustering.AddACenter()
|
|
}
|
|
|
|
return clustering
|
|
}
|
|
|
|
// K returns the number of clusters in the K-means clustering algorithm.
//
// No parameters.
// Returns an integer: the current number of centers (this can grow
// while centers are being added with AddACenter).
func (clustering *KmeansClustering) K() int {
	return len(clustering.icenters)
}
|
|
|
|
// N returns the size of the dataset in the KmeansClustering instance.
//
// It does not take any parameters.
// Returns an integer: the number of rows (data points) in the data
// matrix.
func (clustering *KmeansClustering) N() int {
	return len(*clustering.data)
}
|
|
|
|
// Dimension returns the dimension of the KmeansClustering data.
//
// No parameters.
// Returns an integer: the number of columns of the first data row
// (all rows are assumed to share it; panics on an empty dataset).
func (clustering *KmeansClustering) Dimension() int {
	return len((*clustering.data)[0])
}
|
|
|
|
// SetCenterTo sets the center of a specific cluster to a given data point index.
//
// Parameters:
//   - k: the index of the cluster, if k=-1, a new center is added
//   - i: the index of the data point
//   - reset: a boolean indicating whether to reset the distances to the nearest center
//     for points previously assigned to this center
//
// No return value.

func (clustering *KmeansClustering) SetCenterTo(k, i int, reset bool) {
	N := clustering.N()
	K := clustering.K()
	// The new center takes the coordinates of data point i. Note the
	// row is shared, not copied: centers[k] aliases the data matrix.
	center := (*clustering.data)[i]

	if k >= 0 {
		// Replace an existing center in place and empty its cluster.
		clustering.icenters[k] = i
		clustering.sizes[k] = 0
		clustering.centers[k] = center

		if reset {
			// Recompute distances to the nearest center for points
			// previously assigned to this center
			// NOTE(review): this K shadows the outer K with the same value.
			K := clustering.K()
			for j := 0; j < N; j++ {
				if clustering.classes[j] == k {
					clustering.distmin[j] = math.MaxFloat64
					// NOTE(review): the scan starts at l = 1 and therefore
					// never considers center 0 as a candidate here — confirm
					// this is intentional; it looks like an off-by-one.
					for l := 1; l < K; l++ {
						dist := EuclideanDist((*clustering.data)[j], clustering.centers[l])
						if dist < clustering.distmin[j] {
							clustering.distmin[j] = dist
							clustering.classes[j] = l
						}
					}
					// Count the point in whichever class it ended up in.
					clustering.sizes[clustering.classes[j]]++
				}
			}
		}

	} else {
		// k == -1: append a brand-new center and make it the current
		// cluster index k for the capture loop below.
		clustering.icenters = append(clustering.icenters, i)
		clustering.sizes = append(clustering.sizes, 0)
		clustering.centers = append(clustering.centers, center)
		k = K
		K++
	}

	// Capture every point that is now strictly closer to the new or
	// updated center than to its previously recorded closest center.
	for j := 0; j < clustering.N(); j++ {
		dist := EuclideanDist((*clustering.data)[j], center)
		if dist < clustering.distmin[j] {
			if C := clustering.classes[j]; C >= 0 {
				clustering.sizes[C]--
			}
			clustering.distmin[j] = dist
			clustering.classes[j] = k
			clustering.sizes[k]++
		}
	}

}
|
|
|
|
// AddACenter adds a new center to the KmeansClustering.
|
|
//
|
|
// If there are no centers, it randomly selects a new center.
|
|
// If there are existing centers, it selects a new center with
|
|
// probability proportional to its distance from the nearest
|
|
// center. The center is then added to the clustering.
|
|
func (clustering *KmeansClustering) AddACenter() {
|
|
k := clustering.K()
|
|
C := 0
|
|
|
|
if k == 0 {
|
|
// if there are no centers yet, draw a sample as the first center
|
|
C = rand.Intn(clustering.N())
|
|
} else {
|
|
// otherwise, draw a sample with a probability proportional
|
|
// to its closest distance to a center
|
|
w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
|
|
C, _ = w.Take()
|
|
}
|
|
|
|
clustering.SetCenterTo(-1, C, false)
|
|
}
|
|
|
|
// ResetEmptyCenters reinitializes any centers in a KmeansClustering
|
|
// that have no assigned points.
|
|
//
|
|
// This method iterates over the centers and uses a weighted sampling
|
|
// to reset centers with a size of zero.
|
|
// Returns the number of centers that were reset.
|
|
func (clustering *KmeansClustering) ResetEmptyCenters() int {
|
|
nreset := 0
|
|
for i := 0; i < clustering.K(); i++ {
|
|
if clustering.sizes[i] == 0 {
|
|
w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
|
|
C, _ := w.Take()
|
|
clustering.SetCenterTo(i, C, false)
|
|
nreset++
|
|
}
|
|
}
|
|
return nreset
|
|
}
|
|
|
|
// ClosestPoint finds the index of the closest point in the
|
|
// clustering to the given coordinates.
|
|
//
|
|
// coordinates is a slice of float64 representing the point.
|
|
// Returns the index of the closest point as an int.
|
|
func (clustering *KmeansClustering) ClosestPoint(coordinates []float64) int {
|
|
N := clustering.N()
|
|
distmin := math.MaxFloat64
|
|
C := -1
|
|
for i := 0; i < N; i++ {
|
|
dist := EuclideanDist((*clustering.data)[i], coordinates)
|
|
if dist < distmin {
|
|
distmin = dist
|
|
C = i
|
|
}
|
|
}
|
|
return C
|
|
}
|
|
|
|
// AssignToClass assigns each data point in the dataset to the nearest
|
|
// center (class) in a K-means clustering algorithm.
|
|
//
|
|
// Handles the reinitialization of empty centers after the assignment.
|
|
// No return values.
|
|
func (clustering *KmeansClustering) AssignToClass() {
|
|
var wg sync.WaitGroup
|
|
var lock sync.Mutex
|
|
|
|
// initialize the number of points in each class
|
|
for i := 0; i < clustering.K(); i++ {
|
|
clustering.sizes[i] = 0
|
|
}
|
|
|
|
goroutine := func(i int) {
|
|
defer wg.Done()
|
|
dmin := math.MaxFloat64
|
|
cmin := -1
|
|
for j, center := range clustering.centers {
|
|
dist := EuclideanDist((*clustering.data)[i], center)
|
|
if dist < dmin {
|
|
dmin = dist
|
|
cmin = j
|
|
}
|
|
}
|
|
|
|
clustering.classes[i] = cmin
|
|
clustering.distmin[i] = dmin
|
|
|
|
lock.Lock()
|
|
clustering.sizes[cmin]++
|
|
lock.Unlock()
|
|
}
|
|
|
|
wg.Add(clustering.N())
|
|
for i := 0; i < clustering.N(); i++ {
|
|
go goroutine(i)
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
nreset := clustering.ResetEmptyCenters()
|
|
|
|
if nreset > 0 {
|
|
obilog.Warnf("Reseted %d empty centers", nreset)
|
|
}
|
|
}
|
|
|
|
// SetCentersTo assigns new centers in the KmeansClustering
|
|
// structure given a slice of indices.
|
|
//
|
|
// The indices parameter is a slice of integers that
|
|
// corresponds to the new indices of the cluster centers in
|
|
// the dataset. It panics if any index is out of bounds.
|
|
// This method does not return any value.
|
|
func (clustering *KmeansClustering) SetCentersTo(indices []int) {
|
|
for _, v := range indices {
|
|
if v < 0 || v >= clustering.N() {
|
|
log.Fatalf("Invalid center index: %d", v)
|
|
}
|
|
}
|
|
|
|
clustering.icenters = indices
|
|
K := len(indices)
|
|
|
|
for i := 0; i < K; i++ {
|
|
clustering.centers[i] = (*clustering.data)[indices[i]]
|
|
}
|
|
|
|
clustering.AssignToClass()
|
|
|
|
}
|
|
|
|
// ComputeCenters calculates the centers of the K-means clustering algorithm.
|
|
//
|
|
// This method call AssignToClass() after computing the centers to ensure coherence
|
|
// of the clustering data structure.
|
|
//
|
|
// It takes no parameters.
|
|
// It does not return any values.
|
|
func (clustering *KmeansClustering) ComputeCenters() {
|
|
var wg sync.WaitGroup
|
|
centers := clustering.centers
|
|
data := clustering.data
|
|
classes := clustering.classes
|
|
K := clustering.K()
|
|
|
|
// compute the location of center of class centerIdx
|
|
// as the point in the data the closest to the
|
|
// center of class centerIdx
|
|
newCenter := func(centerIdx int) {
|
|
defer wg.Done()
|
|
|
|
center := make([]float64, clustering.Dimension())
|
|
|
|
for j := range center {
|
|
center[j] = 0
|
|
}
|
|
|
|
for j, row := range *data {
|
|
if classes[j] == centerIdx {
|
|
for l, val := range row {
|
|
center[l] += val
|
|
}
|
|
}
|
|
}
|
|
|
|
for j := range centers[centerIdx] {
|
|
center[j] /= float64(clustering.sizes[centerIdx])
|
|
}
|
|
|
|
C := clustering.ClosestPoint(center)
|
|
|
|
centers[centerIdx] = (*data)[C]
|
|
clustering.icenters[centerIdx] = C
|
|
}
|
|
|
|
for i := 0; i < K; i++ {
|
|
wg.Add(1)
|
|
go newCenter(i)
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
clustering.AssignToClass()
|
|
|
|
}
|
|
|
|
func (clustering *KmeansClustering) Inertia() float64 {
|
|
inertia := 0.0
|
|
|
|
for i := 0; i < clustering.N(); i++ {
|
|
inertia += clustering.distmin[i] * clustering.distmin[i]
|
|
}
|
|
return inertia
|
|
}
|
|
|
|
// Centers returns the matrix of current center coordinates (k x p).
//
// The returned matrix is the internal one, not a copy.
func (clustering *KmeansClustering) Centers() obiutils.Matrix[float64] {
	return clustering.centers
}
|
|
|
|
// CentersIndices returns, for each cluster, the index in the dataset
// of the point currently used as its center.
//
// The returned slice is the internal one, not a copy.
func (clustering *KmeansClustering) CentersIndices() []int {
	return clustering.icenters
}
|
|
|
|
// Sizes returns the number of points currently assigned to each
// cluster.
//
// The returned slice is the internal one, not a copy.
func (clustering *KmeansClustering) Sizes() []int {
	return clustering.sizes
}
|
|
|
|
// Classes returns the cluster index assigned to each data point.
//
// The returned slice is the internal one, not a copy.
func (clustering *KmeansClustering) Classes() []int {
	return clustering.classes
}
|
|
|
|
func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool {
|
|
prev := math.MaxFloat64
|
|
newI := clustering.Inertia()
|
|
for i := 0; i < max_cycle && (prev-newI) > threshold; i++ {
|
|
prev = newI
|
|
clustering.ComputeCenters()
|
|
newI = clustering.Inertia()
|
|
}
|
|
|
|
return (prev - newI) <= threshold
|
|
}
|
|
|
|
// // Kmeans performs the K-means clustering algorithm on the given data.
|
|
|
|
// // if centers and *center is not nil, centers is considered as initialized
|
|
// // and the number of classes (k) is set to the number of rows in centers.
|
|
// // otherwise, the number of classes is defined by the value of k.
|
|
|
|
// // Parameters:
|
|
// // - data: A pointer to a Matrix[float64] that represents the input data.
|
|
// // - k: An integer that specifies the number of clusters to create.
|
|
// // - threshold: A float64 value that determines the convergence threshold.
|
|
// // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
|
|
|
|
// // Returns:
|
|
// // - classes: A slice of integers that assigns each data point to a cluster.
|
|
// // - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
|
|
// // - inertia: A float64 value that represents the overall inertia of the clustering.
|
|
// // - converged: A boolean value indicating whether the algorithm converged.
|
|
// func Kmeans(data *obiutils.Matrix[float64],
|
|
// k int,
|
|
// threshold float64,
|
|
// centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
|
|
// if centers == nil || *centers == nil {
|
|
// *centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
|
|
// center_ids := SampleIntWithoutReplacement(k, len(*data))
|
|
// for i, id := range center_ids {
|
|
// (*centers)[i] = (*data)[id]
|
|
// }
|
|
// } else {
|
|
// k = len(*centers)
|
|
// }
|
|
|
|
// classes := AssignToClass(data, centers)
|
|
// centers = ComputeCenters(data, k, classes)
|
|
// inertia := ComputeInertia(data, classes, centers)
|
|
// delta := threshold * 100.0
|
|
// for i := 0; i < 100 && delta > threshold; i++ {
|
|
// classes = AssignToClass(data, centers)
|
|
// centers = ComputeCenters(data, k, classes)
|
|
// newi := ComputeInertia(data, classes, centers)
|
|
// delta = inertia - newi
|
|
// inertia = newi
|
|
// log.Debugf("Inertia: %f, delta: %f", inertia, delta)
|
|
// }
|
|
|
|
// return classes, centers, inertia, delta < threshold
|
|
// }
|
|
|
|
// // KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
|
|
// //
|
|
// // It takes a matrix of data points and a matrix of centers as input.
|
|
// // The best representative is the data point that is closest to the center of the cluster.
|
|
// // Returns an array of integers containing the index of the best representative for each cluster.
|
|
// func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
|
|
// bestRepresentative := make([]int, len(*centers))
|
|
|
|
// var wg sync.WaitGroup
|
|
// wg.Add(len(*centers))
|
|
|
|
// for j, center := range *centers {
|
|
// go func(j int, center []float64) {
|
|
// defer wg.Done()
|
|
|
|
// bestDistToCenter := math.MaxFloat64
|
|
// best := -1
|
|
|
|
// for i, row := range *data {
|
|
// dist := 0.0
|
|
// for d, val := range row {
|
|
// diff := val - center[d]
|
|
// dist += diff * diff
|
|
// }
|
|
// if dist < bestDistToCenter {
|
|
// bestDistToCenter = dist
|
|
// best = i
|
|
// }
|
|
// }
|
|
|
|
// if best == -1 {
|
|
// log.Fatalf("No representative found for cluster %d", j)
|
|
// }
|
|
|
|
// bestRepresentative[j] = best
|
|
// }(j, center)
|
|
// }
|
|
|
|
// wg.Wait()
|
|
|
|
// return bestRepresentative
|
|
// }
|