diff --git a/pkg/obistats/kmeans.go b/pkg/obistats/kmeans.go new file mode 100644 index 0000000..d513e9d --- /dev/null +++ b/pkg/obistats/kmeans.go @@ -0,0 +1,176 @@ +package obistats + +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" + log "github.com/sirupsen/logrus" + "math" +) + +// AssignToClass applies the nearest neighbor algorithm to assign data points to classes. +// +// Parameters: +// - data: a 2D slice of float64 representing the data points to be assigned. +// - centers: a 2D slice of float64 representing the center points for each class. +// +// Return: +// - classes: a slice of int representing the assigned class for each data point. +func AssignToClass(data, centers *obiutils.Matrix[float64]) []int { + classes := make([]int, len(*data)) + for i, rowData := range *data { + minDist := math.MaxFloat64 + for j, centerData := range *centers { + dist := 0.0 + for d, val := range rowData { + dist += math.Pow(val-centerData[d], 2) + } + if dist < minDist { + minDist = dist + classes[i] = j + } + } + } + return classes +} + +// ComputeCenters calculates the centers of clusters for a given data set. +// +// Parameters: +// - data: a pointer to a matrix of float64 values representing the data set. +// - k: an integer representing the number of clusters. +// - classes: a slice of integers representing the assigned cluster for each data point. +// +// Returns: +// - centers: a pointer to a matrix of float64 values representing the centers of the clusters. +func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiutils.Matrix[float64] { + centers := obiutils.Make2DArray[float64](k, len((*data)[0])) + centers.Init(0.0) + ns := make([]int, k) + + for i := range ns { + ns[i] = 0 + } + + for i, row := range *data { + ns[classes[i]]++ + for j, val := range row { + centers[classes[i]][j] += val + } + } + + for i := range centers { + for j := range centers[i] { + centers[i][j] /= float64(ns[i]) + } + } + + return ¢ers +} + +// ComputeInertia computes the inertia of the given data and centers. +// +// Parameters: +// - data: A pointer to a Matrix of float64 representing the data. +// - centers: A pointer to a Matrix of float64 representing the centers. +// +// Return type: +// - float64: The computed inertia. +func ComputeInertia(data *obiutils.Matrix[float64], classes []int, centers *obiutils.Matrix[float64]) float64 { + inertia := 0.0 + for i, row := range *data { + for j, val := range row { + inertia += math.Pow(val-(*centers)[classes[i]][j], 2) + } + } + return inertia +} + +// Kmeans performs the k-means clustering algorithm on the given data. +// +// if centers and *center is not nil, centers is considered as initialized +// and the number of classes (k) is set to the number of rows in centers. +// overwise, the number of classes is defined by the value of k. +// +// Parameters: +// - data: A pointer to a matrix containing the input data. +// - k: An integer representing the number of clusters. +// - centers: A pointer to a matrix representing the initial cluster centers. +// +// Returns: +// - A slice of integers representing the assigned class labels for each data point. +// - A pointer to a matrix representing the final cluster centers. + +func Kmeans(data *obiutils.Matrix[float64], + k int, + // Kmeans performs the K-means clustering algorithm on the given data. + // + // if centers and *center is not nil, centers is considered as initialized + // and the number of classes (k) is set to the number of rows in centers. + // overwise, the number of classes is defined by the value of k. + // + // Parameters: + // - data: A pointer to a Matrix[float64] that represents the input data. + // - k: An integer that specifies the number of clusters to create. + // - threshold: A float64 value that determines the convergence threshold. + // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers. + // + // Returns: + // - classes: A slice of integers that assigns each data point to a cluster. + // - centers: A pointer to a Matrix[float64] that contains the final cluster centers. + // - inertia: A float64 value that represents the overall inertia of the clustering. + // - converged: A boolean value indicating whether the algorithm converged. + threshold float64, + centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) { + if centers == nil || *centers == nil { + *centers = obiutils.Make2DArray[float64](k, len((*data)[0])) + center_ids := SampleIntWithoutReplacemant(k, len(*data)) + for i, id := range center_ids { + (*centers)[i] = (*data)[id] + } + } else { + k = len(*centers) + } + + classes := AssignToClass(data, centers) + centers = ComputeCenters(data, k, classes) + inertia := ComputeInertia(data, classes, centers) + delta := threshold * 100.0 + for i := 0; i < 100 && delta > threshold; i++ { + classes = AssignToClass(data, centers) + centers = ComputeCenters(data, k, classes) + newi := ComputeInertia(data, classes, centers) + delta = inertia - newi + inertia = newi + log.Debugf("Inertia: %f, delta: %f", inertia, delta) + } + + return classes, centers, inertia, delta < threshold +} + +// KmeansBestRepresentative finds the best representative among the data point of each cluster. +// +// It takes a matrix of data points and a matrix of centers as input. +// The best representative is the data point that is closest to the center of the cluster. +// Returns an array of integers containing the index of the best representative for each cluster. +func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int { + best_dist_to_centers := make([]float64, len(*centers)) + best_representative := make([]int, len(*centers)) + + for i := range best_dist_to_centers { + best_dist_to_centers[i] = math.MaxFloat64 + } + + for i, row := range *data { + for j, center := range *centers { + dist := 0.0 + for d, val := range row { + dist += math.Pow(val-center[d], 2) + } + if dist < best_dist_to_centers[j] { + best_dist_to_centers[j] = dist + best_representative[j] = i + } + } + } + + return best_representative +} diff --git a/pkg/obistats/random.go b/pkg/obistats/random.go new file mode 100644 index 0000000..5ecbf45 --- /dev/null +++ b/pkg/obistats/random.go @@ -0,0 +1,25 @@ +package obistats + +import "math/rand" + +func SampleIntWithoutReplacemant(n, max int) []int { + + draw := make(map[int]int, n) + + for i := 0; i < n; i++ { + y := rand.Intn(max) + x, ok := draw[y] + if ok { + y = x + } + draw[y] = max - 1 + max-- + } + + res := make([]int, 0, n) + for i := range draw { + res = append(res, i) + } + + return res +} diff --git a/pkg/obitools/obilandmark/obilandmark.go b/pkg/obitools/obilandmark/obilandmark.go new file mode 100644 index 0000000..bec88a6 --- /dev/null +++ b/pkg/obitools/obilandmark/obilandmark.go @@ -0,0 +1,139 @@ +package obilandmark + +import ( + "math" + "os" + "sort" + "sync" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obistats" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" + "github.com/schollz/progressbar/v3" + log "github.com/sirupsen/logrus" +) + +func MapOnLandmarkSequences(library obiseq.BioSequenceSlice, landmark_idx []int, sizes ...int) obiutils.Matrix[float64] { + nworkers := obioptions.CLIParallelWorkers() + + if len(sizes) > 0 { + nworkers = sizes[0] + } + + library_size := len(library) + n_landmark := len(landmark_idx) + todo := make(chan int, 0) + + seqworld := obiutils.Make2DArray[float64](library_size, n_landmark) + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowCount(), + progressbar.OptionShowIts(), + progressbar.OptionSetDescription("[Sequence mapping]"), + ) + + bar := progressbar.NewOptions(library_size, pbopt...) + + waiting := sync.WaitGroup{} + waiting.Add(nworkers) + + compute_coordinates := func() { + buffer := make([]uint64, 1000) + for i := range todo { + seq := library[i] + coord := seqworld[i] + for j := 0; j < n_landmark; j++ { + landmark := library[landmark_idx[j]] + match, lalign := obialign.FastLCSScore(landmark, seq, -1, &buffer) + coord[j] = float64(lalign - match) + } + bar.Add(1) + } + waiting.Done() + } + + for i := 0; i < nworkers; i++ { + go compute_coordinates() + } + + for i := 0; i < library_size; i++ { + todo <- i + } + + close(todo) + + waiting.Wait() + + return seqworld +} + +func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence { + + library := iterator.Load() + + library_size := len(library) + n_landmark := NCenter() + + landmark_idx := obistats.SampleIntWithoutReplacemant(n_landmark, library_size) + log.Infof("Library contains %d sequence", len(library)) + + var seqworld obiutils.Matrix[float64] + + for loop := 0; loop < 5; loop++ { + sort.IntSlice(landmark_idx).Sort() + log.Infof("Selected indices : %v", landmark_idx) + + seqworld = MapOnLandmarkSequences(library, landmark_idx) + initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark) + for i, seq_idx := range landmark_idx { + initialCenters[i] = seqworld[seq_idx] + } + + // classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters) + _, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters) + + dist_centers := 0.0 + for i := 0; i < n_landmark; i++ { + for j := 0; j < n_landmark; j++ { + dist_centers += math.Pow((*centers)[i][j]-initialCenters[i][j], 2) + } + } + + landmark_idx = obistats.KmeansBestRepresentative(&seqworld, centers) + log.Infof("Inertia: %f, Dist centers: %f, converged: %t", inertia, dist_centers, converged) + + } + + sort.IntSlice(landmark_idx).Sort() + + log.Infof("Selected indices : %v", landmark_idx) + seqworld = MapOnLandmarkSequences(library, landmark_idx) + + seq_landmark := make(map[int]int, n_landmark) + for i, val := range landmark_idx { + seq_landmark[val] = i + } + + initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark) + for i, seq_idx := range landmark_idx { + initialCenters[i] = seqworld[seq_idx] + } + + classes := obistats.AssignToClass(&seqworld, &initialCenters) + for i, seq := range library { + seq.SetAttribute("landmark_coord", seqworld[i]) + seq.SetAttribute("landmark_class", classes[i]) + if i, ok := seq_landmark[i]; ok { + seq.SetAttribute("landmark_id", i) + } + } + + return obiiter.IBatchOver(library, obioptions.CLIBatchSize()) + +} diff --git a/pkg/obitools/obilandmark/options.go b/pkg/obitools/obilandmark/options.go new file mode 100644 index 0000000..bdeff8e --- /dev/null +++ b/pkg/obitools/obilandmark/options.go @@ -0,0 +1,29 @@ +package obilandmark + +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var _nCenter = 200 + +// ObilandmarkOptionSet sets the options for Obilandmark. +// +// options: a pointer to the getoptions.GetOpt struct. +// Return type: none. +func ObilandmarkOptionSet(options *getoptions.GetOpt) { + + options.IntVar(&_nCenter, "center", _nCenter, + options.Alias("n"), + options.Description("Maximum numbers of differences between two variant sequences (default: %d).")) +} + +func OptionSet(options *getoptions.GetOpt) { + obiconvert.InputOptionSet(options) + obiconvert.OutputOptionSet(options) + ObilandmarkOptionSet(options) +} + +func NCenter() int { + return _nCenter +} diff --git a/pkg/obiutils/array.go b/pkg/obiutils/array.go index 0fa4346..e1635a5 100644 --- a/pkg/obiutils/array.go +++ b/pkg/obiutils/array.go @@ -22,6 +22,17 @@ func Make2DArray[T any](rows, cols int) Matrix[T] { return matrix } +// Init initializes the Matrix with the given value. +// +// value: the value to initialize the Matrix elements with. +func (matrix *Matrix[T]) Init(value T) { + data := (*matrix)[0] + data = data[0:cap(data)] + for i := range data { + data[i] = value + } +} + // Row returns the i-th row of the matrix. // // Parameters: @@ -38,3 +49,11 @@ func (matrix *Matrix[T]) Column(i int) []T { } return r } + +// Dim returns the dimensions of the Matrix. +// +// It takes no parameters. +// It returns two integers: the number of rows and the number of columns. +func (matrix *Matrix[T]) Dim() (int, int) { + return len(*matrix), len((*matrix)[0]) +}