mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
first trial of obilandmark
Former-commit-id: 00a50bdbf407b03dfdc385a848a536559f5966a5
This commit is contained in:
176
pkg/obistats/kmeans.go
Normal file
176
pkg/obistats/kmeans.go
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
package obistats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
"math"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AssignToClass applies the nearest neighbor algorithm to assign data points to classes.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - data: a 2D slice of float64 representing the data points to be assigned.
|
||||||
|
// - centers: a 2D slice of float64 representing the center points for each class.
|
||||||
|
//
|
||||||
|
// Return:
|
||||||
|
// - classes: a slice of int representing the assigned class for each data point.
|
||||||
|
func AssignToClass(data, centers *obiutils.Matrix[float64]) []int {
|
||||||
|
classes := make([]int, len(*data))
|
||||||
|
for i, rowData := range *data {
|
||||||
|
minDist := math.MaxFloat64
|
||||||
|
for j, centerData := range *centers {
|
||||||
|
dist := 0.0
|
||||||
|
for d, val := range rowData {
|
||||||
|
dist += math.Pow(val-centerData[d], 2)
|
||||||
|
}
|
||||||
|
if dist < minDist {
|
||||||
|
minDist = dist
|
||||||
|
classes[i] = j
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return classes
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComputeCenters calculates the centers of clusters for a given data set.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - data: a pointer to a matrix of float64 values representing the data set.
|
||||||
|
// - k: an integer representing the number of clusters.
|
||||||
|
// - classes: a slice of integers representing the assigned cluster for each data point.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
|
||||||
|
func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiutils.Matrix[float64] {
|
||||||
|
centers := obiutils.Make2DArray[float64](k, len((*data)[0]))
|
||||||
|
centers.Init(0.0)
|
||||||
|
ns := make([]int, k)
|
||||||
|
|
||||||
|
for i := range ns {
|
||||||
|
ns[i] = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, row := range *data {
|
||||||
|
ns[classes[i]]++
|
||||||
|
for j, val := range row {
|
||||||
|
centers[classes[i]][j] += val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range centers {
|
||||||
|
for j := range centers[i] {
|
||||||
|
centers[i][j] /= float64(ns[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ¢ers
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComputeInertia computes the inertia of the given data and centers.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - data: A pointer to a Matrix of float64 representing the data.
|
||||||
|
// - centers: A pointer to a Matrix of float64 representing the centers.
|
||||||
|
//
|
||||||
|
// Return type:
|
||||||
|
// - float64: The computed inertia.
|
||||||
|
func ComputeInertia(data *obiutils.Matrix[float64], classes []int, centers *obiutils.Matrix[float64]) float64 {
|
||||||
|
inertia := 0.0
|
||||||
|
for i, row := range *data {
|
||||||
|
for j, val := range row {
|
||||||
|
inertia += math.Pow(val-(*centers)[classes[i]][j], 2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return inertia
|
||||||
|
}
|
||||||
|
|
||||||
|
// Kmeans performs the k-means clustering algorithm on the given data.
|
||||||
|
//
|
||||||
|
// if centers and *center is not nil, centers is considered as initialized
|
||||||
|
// and the number of classes (k) is set to the number of rows in centers.
|
||||||
|
// overwise, the number of classes is defined by the value of k.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - data: A pointer to a matrix containing the input data.
|
||||||
|
// - k: An integer representing the number of clusters.
|
||||||
|
// - centers: A pointer to a matrix representing the initial cluster centers.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - A slice of integers representing the assigned class labels for each data point.
|
||||||
|
// - A pointer to a matrix representing the final cluster centers.
|
||||||
|
|
||||||
|
func Kmeans(data *obiutils.Matrix[float64],
|
||||||
|
k int,
|
||||||
|
// Kmeans performs the K-means clustering algorithm on the given data.
|
||||||
|
//
|
||||||
|
// if centers and *center is not nil, centers is considered as initialized
|
||||||
|
// and the number of classes (k) is set to the number of rows in centers.
|
||||||
|
// overwise, the number of classes is defined by the value of k.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - data: A pointer to a Matrix[float64] that represents the input data.
|
||||||
|
// - k: An integer that specifies the number of clusters to create.
|
||||||
|
// - threshold: A float64 value that determines the convergence threshold.
|
||||||
|
// - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - classes: A slice of integers that assigns each data point to a cluster.
|
||||||
|
// - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
|
||||||
|
// - inertia: A float64 value that represents the overall inertia of the clustering.
|
||||||
|
// - converged: A boolean value indicating whether the algorithm converged.
|
||||||
|
threshold float64,
|
||||||
|
centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
|
||||||
|
if centers == nil || *centers == nil {
|
||||||
|
*centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
|
||||||
|
center_ids := SampleIntWithoutReplacemant(k, len(*data))
|
||||||
|
for i, id := range center_ids {
|
||||||
|
(*centers)[i] = (*data)[id]
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
k = len(*centers)
|
||||||
|
}
|
||||||
|
|
||||||
|
classes := AssignToClass(data, centers)
|
||||||
|
centers = ComputeCenters(data, k, classes)
|
||||||
|
inertia := ComputeInertia(data, classes, centers)
|
||||||
|
delta := threshold * 100.0
|
||||||
|
for i := 0; i < 100 && delta > threshold; i++ {
|
||||||
|
classes = AssignToClass(data, centers)
|
||||||
|
centers = ComputeCenters(data, k, classes)
|
||||||
|
newi := ComputeInertia(data, classes, centers)
|
||||||
|
delta = inertia - newi
|
||||||
|
inertia = newi
|
||||||
|
log.Debugf("Inertia: %f, delta: %f", inertia, delta)
|
||||||
|
}
|
||||||
|
|
||||||
|
return classes, centers, inertia, delta < threshold
|
||||||
|
}
|
||||||
|
|
||||||
|
// KmeansBestRepresentative finds the best representative among the data point of each cluster.
|
||||||
|
//
|
||||||
|
// It takes a matrix of data points and a matrix of centers as input.
|
||||||
|
// The best representative is the data point that is closest to the center of the cluster.
|
||||||
|
// Returns an array of integers containing the index of the best representative for each cluster.
|
||||||
|
func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
|
||||||
|
best_dist_to_centers := make([]float64, len(*centers))
|
||||||
|
best_representative := make([]int, len(*centers))
|
||||||
|
|
||||||
|
for i := range best_dist_to_centers {
|
||||||
|
best_dist_to_centers[i] = math.MaxFloat64
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, row := range *data {
|
||||||
|
for j, center := range *centers {
|
||||||
|
dist := 0.0
|
||||||
|
for d, val := range row {
|
||||||
|
dist += math.Pow(val-center[d], 2)
|
||||||
|
}
|
||||||
|
if dist < best_dist_to_centers[j] {
|
||||||
|
best_dist_to_centers[j] = dist
|
||||||
|
best_representative[j] = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return best_representative
|
||||||
|
}
|
25
pkg/obistats/random.go
Normal file
25
pkg/obistats/random.go
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
package obistats
|
||||||
|
|
||||||
|
import "math/rand"
|
||||||
|
|
||||||
|
func SampleIntWithoutReplacemant(n, max int) []int {
|
||||||
|
|
||||||
|
draw := make(map[int]int, n)
|
||||||
|
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
y := rand.Intn(max)
|
||||||
|
x, ok := draw[y]
|
||||||
|
if ok {
|
||||||
|
y = x
|
||||||
|
}
|
||||||
|
draw[y] = max - 1
|
||||||
|
max--
|
||||||
|
}
|
||||||
|
|
||||||
|
res := make([]int, 0, n)
|
||||||
|
for i := range draw {
|
||||||
|
res = append(res, i)
|
||||||
|
}
|
||||||
|
|
||||||
|
return res
|
||||||
|
}
|
139
pkg/obitools/obilandmark/obilandmark.go
Normal file
139
pkg/obitools/obilandmark/obilandmark.go
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
package obilandmark
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"sort"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obistats"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
func MapOnLandmarkSequences(library obiseq.BioSequenceSlice, landmark_idx []int, sizes ...int) obiutils.Matrix[float64] {
|
||||||
|
nworkers := obioptions.CLIParallelWorkers()
|
||||||
|
|
||||||
|
if len(sizes) > 0 {
|
||||||
|
nworkers = sizes[0]
|
||||||
|
}
|
||||||
|
|
||||||
|
library_size := len(library)
|
||||||
|
n_landmark := len(landmark_idx)
|
||||||
|
todo := make(chan int, 0)
|
||||||
|
|
||||||
|
seqworld := obiutils.Make2DArray[float64](library_size, n_landmark)
|
||||||
|
|
||||||
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
|
pbopt = append(pbopt,
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowCount(),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetDescription("[Sequence mapping]"),
|
||||||
|
)
|
||||||
|
|
||||||
|
bar := progressbar.NewOptions(library_size, pbopt...)
|
||||||
|
|
||||||
|
waiting := sync.WaitGroup{}
|
||||||
|
waiting.Add(nworkers)
|
||||||
|
|
||||||
|
compute_coordinates := func() {
|
||||||
|
buffer := make([]uint64, 1000)
|
||||||
|
for i := range todo {
|
||||||
|
seq := library[i]
|
||||||
|
coord := seqworld[i]
|
||||||
|
for j := 0; j < n_landmark; j++ {
|
||||||
|
landmark := library[landmark_idx[j]]
|
||||||
|
match, lalign := obialign.FastLCSScore(landmark, seq, -1, &buffer)
|
||||||
|
coord[j] = float64(lalign - match)
|
||||||
|
}
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
|
waiting.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < nworkers; i++ {
|
||||||
|
go compute_coordinates()
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < library_size; i++ {
|
||||||
|
todo <- i
|
||||||
|
}
|
||||||
|
|
||||||
|
close(todo)
|
||||||
|
|
||||||
|
waiting.Wait()
|
||||||
|
|
||||||
|
return seqworld
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
|
|
||||||
|
library := iterator.Load()
|
||||||
|
|
||||||
|
library_size := len(library)
|
||||||
|
n_landmark := NCenter()
|
||||||
|
|
||||||
|
landmark_idx := obistats.SampleIntWithoutReplacemant(n_landmark, library_size)
|
||||||
|
log.Infof("Library contains %d sequence", len(library))
|
||||||
|
|
||||||
|
var seqworld obiutils.Matrix[float64]
|
||||||
|
|
||||||
|
for loop := 0; loop < 5; loop++ {
|
||||||
|
sort.IntSlice(landmark_idx).Sort()
|
||||||
|
log.Infof("Selected indices : %v", landmark_idx)
|
||||||
|
|
||||||
|
seqworld = MapOnLandmarkSequences(library, landmark_idx)
|
||||||
|
initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark)
|
||||||
|
for i, seq_idx := range landmark_idx {
|
||||||
|
initialCenters[i] = seqworld[seq_idx]
|
||||||
|
}
|
||||||
|
|
||||||
|
// classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters)
|
||||||
|
_, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters)
|
||||||
|
|
||||||
|
dist_centers := 0.0
|
||||||
|
for i := 0; i < n_landmark; i++ {
|
||||||
|
for j := 0; j < n_landmark; j++ {
|
||||||
|
dist_centers += math.Pow((*centers)[i][j]-initialCenters[i][j], 2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
landmark_idx = obistats.KmeansBestRepresentative(&seqworld, centers)
|
||||||
|
log.Infof("Inertia: %f, Dist centers: %f, converged: %t", inertia, dist_centers, converged)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.IntSlice(landmark_idx).Sort()
|
||||||
|
|
||||||
|
log.Infof("Selected indices : %v", landmark_idx)
|
||||||
|
seqworld = MapOnLandmarkSequences(library, landmark_idx)
|
||||||
|
|
||||||
|
seq_landmark := make(map[int]int, n_landmark)
|
||||||
|
for i, val := range landmark_idx {
|
||||||
|
seq_landmark[val] = i
|
||||||
|
}
|
||||||
|
|
||||||
|
initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark)
|
||||||
|
for i, seq_idx := range landmark_idx {
|
||||||
|
initialCenters[i] = seqworld[seq_idx]
|
||||||
|
}
|
||||||
|
|
||||||
|
classes := obistats.AssignToClass(&seqworld, &initialCenters)
|
||||||
|
for i, seq := range library {
|
||||||
|
seq.SetAttribute("landmark_coord", seqworld[i])
|
||||||
|
seq.SetAttribute("landmark_class", classes[i])
|
||||||
|
if i, ok := seq_landmark[i]; ok {
|
||||||
|
seq.SetAttribute("landmark_id", i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return obiiter.IBatchOver(library, obioptions.CLIBatchSize())
|
||||||
|
|
||||||
|
}
|
29
pkg/obitools/obilandmark/options.go
Normal file
29
pkg/obitools/obilandmark/options.go
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
package obilandmark
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _nCenter = 200
|
||||||
|
|
||||||
|
// ObilandmarkOptionSet sets the options for Obilandmark.
|
||||||
|
//
|
||||||
|
// options: a pointer to the getoptions.GetOpt struct.
|
||||||
|
// Return type: none.
|
||||||
|
func ObilandmarkOptionSet(options *getoptions.GetOpt) {
|
||||||
|
|
||||||
|
options.IntVar(&_nCenter, "center", _nCenter,
|
||||||
|
options.Alias("n"),
|
||||||
|
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
|
||||||
|
}
|
||||||
|
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.InputOptionSet(options)
|
||||||
|
obiconvert.OutputOptionSet(options)
|
||||||
|
ObilandmarkOptionSet(options)
|
||||||
|
}
|
||||||
|
|
||||||
|
func NCenter() int {
|
||||||
|
return _nCenter
|
||||||
|
}
|
@ -22,6 +22,17 @@ func Make2DArray[T any](rows, cols int) Matrix[T] {
|
|||||||
return matrix
|
return matrix
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Init initializes the Matrix with the given value.
|
||||||
|
//
|
||||||
|
// value: the value to initialize the Matrix elements with.
|
||||||
|
func (matrix *Matrix[T]) Init(value T) {
|
||||||
|
data := (*matrix)[0]
|
||||||
|
data = data[0:cap(data)]
|
||||||
|
for i := range data {
|
||||||
|
data[i] = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Row returns the i-th row of the matrix.
|
// Row returns the i-th row of the matrix.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
@ -38,3 +49,11 @@ func (matrix *Matrix[T]) Column(i int) []T {
|
|||||||
}
|
}
|
||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Dim returns the dimensions of the Matrix.
|
||||||
|
//
|
||||||
|
// It takes no parameters.
|
||||||
|
// It returns two integers: the number of rows and the number of columns.
|
||||||
|
func (matrix *Matrix[T]) Dim() (int, int) {
|
||||||
|
return len(*matrix), len((*matrix)[0])
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user