From 2caaa62485dd417aa594eb131961692b71b6e075 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 11 Dec 2023 16:07:03 +0100 Subject: [PATCH] Implements the kmeans++ algo to select the landmarks in the geometric method Former-commit-id: 732404a0dc6d7276e4e479dd2481aa4bd42d4ce5 --- Release-notes.md | 6 +- pkg/obiseq/biosequenceslice.go | 10 +- pkg/obistats/kmeans.go | 297 +++++++++++++++++------- pkg/obitools/obilandmark/obilandmark.go | 35 +-- pkg/obitools/obirefidx/geomindexing.go | 15 +- pkg/obitools/obitag/obigeomtag.go | 14 +- pkg/obitools/obitag/obitag.go | 16 +- pkg/obiutils/ranks.go | 6 + 8 files changed, 259 insertions(+), 140 deletions(-) diff --git a/Release-notes.md b/Release-notes.md index 08e277d..be31c88 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -21,7 +21,11 @@ ### Enhancement -- For efficiency purposes, now the `obiuniq` command run on disk by default. Consequently, the **--on-disk** option has been replaced by **--in-memory** to ask explicitly to use memory. +- For efficiency purposes, now the `obiuniq` command run on disk by default. Consequently, the **--on-disk** + option has been replaced by **--in-memory** to ask explicitly to use memory. +- Adds an option **--penalty-scale** to the `obipairing` and `obipcrtag` command to fine tune the pairing score + in the system of the alignment procedure by applying a scaling factor to the mismatch score and the gap score + relatively to the match score. ### Bug fixes diff --git a/pkg/obiseq/biosequenceslice.go b/pkg/obiseq/biosequenceslice.go index 85bc41b..f1e90b6 100644 --- a/pkg/obiseq/biosequenceslice.go +++ b/pkg/obiseq/biosequenceslice.go @@ -31,7 +31,7 @@ func NewBioSequenceSlice(size ...int) *BioSequenceSlice { slice := _BioSequenceSlicePool.Get().(*BioSequenceSlice) if len(size) > 0 { s := size[0] - slice = slice.InsureCapacity(s) + slice = slice.EnsureCapacity(s) (*slice) = (*slice)[0:s] } return slice @@ -76,11 +76,11 @@ func (s *BioSequenceSlice) Recycle(including_seq bool) { _BioSequenceSlicePool.Put(s) } -// InsureCapacity ensures that the BioSequenceSlice has a minimum capacity +// EnsureCapacity ensures that the BioSequenceSlice has a minimum capacity // // It takes an integer `capacity` as a parameter, which represents the desired minimum capacity of the BioSequenceSlice. // It returns a pointer to the BioSequenceSlice. -func (s *BioSequenceSlice) InsureCapacity(capacity int) *BioSequenceSlice { +func (s *BioSequenceSlice) EnsureCapacity(capacity int) *BioSequenceSlice { var c int if s != nil { c = cap(*s) @@ -88,7 +88,9 @@ func (s *BioSequenceSlice) InsureCapacity(capacity int) *BioSequenceSlice { c = 0 } - *s = slices.Grow[BioSequenceSlice](*s, capacity-c) + if capacity > c { + *s = slices.Grow[BioSequenceSlice](*s, capacity-c) + } return s } diff --git a/pkg/obistats/kmeans.go b/pkg/obistats/kmeans.go index e790e8c..10d59e8 100644 --- a/pkg/obistats/kmeans.go +++ b/pkg/obistats/kmeans.go @@ -12,43 +12,75 @@ import ( log "github.com/sirupsen/logrus" ) -func squareDist(a, b []float64) float64 { +// SquareDist calculates the squared Euclidean distance between +// two vectors 'a' and 'b'. +// +// 'a' and 'b' are slices of float64 values representing +// coordinate points in space. It is assumed that both slices +// have the same length. +// Returns the calculated squared distance as a float64. +func SquareDist[T float64 | int](a, b []T) float64 { sum := 0.0 - for i := 0; i < len(a); i++ { - diff := a[i] - b[i] + for i, v := range a { + diff := float64(v - b[i]) sum += diff * diff } return sum } +// EuclideanDist calculates the Euclidean distance between +// two vectors represented as slices of float64. +// +// `a` and `b` are slices of float64 where each element of `a` +// is paired with the corresponding element of `b`. +// Returns the squared sum of the differences. +func EuclideanDist[T float64 | int](a, b []T) float64 { + return math.Sqrt(SquareDist(a, b)) +} + +// DefaultRG creates and returns a new instance of *rand.Rand. +// +// No parameters. +// Returns *rand.Rand which is a pointer to a new random number +// generator, seeded with the current time in nanoseconds. func DefaultRG() *rand.Rand { return rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) } type KmeansClustering struct { - data *obiutils.Matrix[float64] - rg *rand.Rand - centers obiutils.Matrix[float64] - icenters []int - sizes []int - distmin []float64 - classes []int + data *obiutils.Matrix[float64] // data matrix dimensions: n x p + distmin []float64 // distance to closest center dimension: n + classes []int // class of each data point dimension: n + rg *rand.Rand // random number generator + centers obiutils.Matrix[float64] // centers coordinates dimensions: k x p + icenters []int // indices of centers dimension: k + sizes []int // number of elements in each cluster dimension: k } +// MakeKmeansClustering initializes a KmeansClustering with the +// provided matrix data, number of clusters k, and random number +// generator rg. +// +// data is a pointer to a Matrix of float64 representing the dataset, +// k is the number of desired clusters, and rg is a pointer to a +// random number generator used in the clustering process. +// Returns a pointer to the initialized KmeansClustering structure. func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering { distmin := make([]float64, len(*data)) + classes := make([]int, len(*data)) for i := 0; i < len(distmin); i++ { distmin[i] = math.MaxFloat64 + classes[i] = -1 } clustering := &KmeansClustering{ data: data, + distmin: distmin, + classes: classes, + rg: rg, + centers: make(obiutils.Matrix[float64], 0, k), icenters: make([]int, 0, k), sizes: make([]int, 0, k), - centers: make(obiutils.Matrix[float64], 0, k), - distmin: distmin, - classes: make([]int, len(*data)), - rg: rg, } for i := 0; i < k; i++ { @@ -81,79 +113,160 @@ func (clustering *KmeansClustering) N() int { func (clustering *KmeansClustering) Dimension() int { return len((*clustering.data)[0]) } + +// SetCenterTo sets the center of a specific cluster to a given data point index. +// +// Parameters: +// - k: the index of the cluster, if k=-1, a new center is added +// - i: the index of the data point +// - reset: a boolean indicating whether to reset the distances to the nearest center +// for points previously assigned to this center +// +// No return value. + +func (clustering *KmeansClustering) SetCenterTo(k, i int, reset bool) { + N := clustering.N() + K := clustering.K() + center := (*clustering.data)[i] + + if k >= 0 { + clustering.icenters[k] = i + clustering.sizes[k] = 0 + clustering.centers[k] = center + + if reset { + // Recompute distances to the nearest center for points + // previously assigned to this center + K := clustering.K() + for j := 0; j < N; j++ { + if clustering.classes[j] == k { + clustering.distmin[j] = math.MaxFloat64 + for l := 1; l < K; l++ { + dist := EuclideanDist((*clustering.data)[j], clustering.centers[l]) + if dist < clustering.distmin[j] { + clustering.distmin[j] = dist + clustering.classes[j] = l + } + } + clustering.sizes[clustering.classes[j]]++ + } + } + } + + } else { + clustering.icenters = append(clustering.icenters, i) + clustering.sizes = append(clustering.sizes, 0) + clustering.centers = append(clustering.centers, center) + k = K + K++ + } + + for j := 0; j < clustering.N(); j++ { + dist := EuclideanDist((*clustering.data)[j], center) + if dist < clustering.distmin[j] { + if C := clustering.classes[j]; C >= 0 { + clustering.sizes[C]-- + } + clustering.distmin[j] = dist + clustering.classes[j] = k + clustering.sizes[k]++ + } + } + +} + +// AddACenter adds a new center to the KmeansClustering. +// +// If there are no centers, it randomly selects a new center. +// If there are existing centers, it selects a new center with +// probability proportional to its distance from the nearest +// center. The center is then added to the clustering. func (clustering *KmeansClustering) AddACenter() { + k := clustering.K() C := 0 - if clustering.K() == 0 { + + if k == 0 { + // if there are no centers yet, draw a sample as the first center C = rand.Intn(clustering.N()) } else { + // otherwise, draw a sample with a probability proportional + // to its closest distance to a center w := sampleuv.NewWeighted(clustering.distmin, clustering.rg) C, _ = w.Take() } - clustering.icenters = append(clustering.icenters, C) - clustering.sizes = append(clustering.sizes, 0) - center := (*clustering.data)[C] - clustering.centers = append(clustering.centers, center) - n := clustering.N() - - for i := 0; i < n; i++ { - d := squareDist((*clustering.data)[i], center) - if d < clustering.distmin[i] { - clustering.distmin[i] = d - } - } + clustering.SetCenterTo(-1, C, false) } -// ResetEmptyCenters resets the empty centers in the KmeansClustering struct. +// ResetEmptyCenters reinitializes any centers in a KmeansClustering +// that have no assigned points. // -// It iterates over the centers and checks if their corresponding sizes are zero. -// If a center is empty, a new weighted sample is taken with the help of the distmin and rg variables. -// The new center is then assigned to the empty center index, and the sizes and centers arrays are updated accordingly. -// Finally, the function returns the number of empty centers that were reset. +// This method iterates over the centers and uses a weighted sampling +// to reset centers with a size of zero. +// Returns the number of centers that were reset. func (clustering *KmeansClustering) ResetEmptyCenters() int { nreset := 0 for i := 0; i < clustering.K(); i++ { if clustering.sizes[i] == 0 { w := sampleuv.NewWeighted(clustering.distmin, clustering.rg) C, _ := w.Take() - clustering.icenters[i] = C - clustering.centers[i] = (*clustering.data)[C] + clustering.SetCenterTo(i, C, false) nreset++ } } return nreset } -// AssignToClass assigns each data point to a class based on the distance to the nearest center. +// ClosestPoint finds the index of the closest point in the +// clustering to the given coordinates. // -// This function does not take any parameters. -// It does not return anything. +// coordinates is a slice of float64 representing the point. +// Returns the index of the closest point as an int. +func (clustering *KmeansClustering) ClosestPoint(coordinates []float64) int { + N := clustering.N() + distmin := math.MaxFloat64 + C := -1 + for i := 0; i < N; i++ { + dist := EuclideanDist((*clustering.data)[i], coordinates) + if dist < distmin { + distmin = dist + C = i + } + } + return C +} + +// AssignToClass assigns each data point in the dataset to the nearest +// center (class) in a K-means clustering algorithm. +// +// Handles the reinitialization of empty centers after the assignment. +// No return values. func (clustering *KmeansClustering) AssignToClass() { var wg sync.WaitGroup var lock sync.Mutex + // initialize the number of points in each class for i := 0; i < clustering.K(); i++ { clustering.sizes[i] = 0 } - for i := 0; i < clustering.N(); i++ { - clustering.distmin[i] = math.MaxFloat64 - } goroutine := func(i int) { defer wg.Done() dmin := math.MaxFloat64 cmin := -1 for j, center := range clustering.centers { - dist := squareDist((*clustering.data)[i], center) + dist := EuclideanDist((*clustering.data)[i], center) if dist < dmin { dmin = dist cmin = j } } - lock.Lock() + clustering.classes[i] = cmin - clustering.sizes[cmin]++ clustering.distmin[i] = dmin + + lock.Lock() + clustering.sizes[cmin]++ lock.Unlock() } @@ -162,16 +275,45 @@ func (clustering *KmeansClustering) AssignToClass() { go goroutine(i) } + wg.Wait() + nreset := clustering.ResetEmptyCenters() if nreset > 0 { - log.Warnf("Reset %d empty centers", nreset) - clustering.AssignToClass() + log.Warnf("Reseted %d empty centers", nreset) } } +// SetCentersTo assigns new centers in the KmeansClustering +// structure given a slice of indices. +// +// The indices parameter is a slice of integers that +// corresponds to the new indices of the cluster centers in +// the dataset. It panics if any index is out of bounds. +// This method does not return any value. +func (clustering *KmeansClustering) SetCentersTo(indices []int) { + for _, v := range indices { + if v < 0 || v >= clustering.N() { + log.Fatalf("Invalid center index: %d", v) + } + } + + clustering.icenters = indices + K := len(indices) + + for i := 0; i < K; i++ { + clustering.centers[i] = (*clustering.data)[indices[i]] + } + + clustering.AssignToClass() + +} + // ComputeCenters calculates the centers of the K-means clustering algorithm. // +// This method call AssignToClass() after computing the centers to ensure coherence +// of the clustering data structure. +// // It takes no parameters. // It does not return any values. func (clustering *KmeansClustering) ComputeCenters() { @@ -179,66 +321,54 @@ func (clustering *KmeansClustering) ComputeCenters() { centers := clustering.centers data := clustering.data classes := clustering.classes - k := clustering.K() + K := clustering.K() - // Goroutine code - goroutine1 := func(centerIdx int) { + // compute the location of center of class centerIdx + // as the point in the data the closest to the + // center of class centerIdx + newCenter := func(centerIdx int) { defer wg.Done() - for j, row := range *data { - class := classes[j] - if class == centerIdx { - for l, val := range row { - centers[centerIdx][l] += val - } - } + + center := make([]float64, clustering.Dimension()) + + for j := range center { + center[j] = 0 } - } - for i := 0; i < k; i++ { - wg.Add(1) - go goroutine1(i) - } - - wg.Wait() - - for i := range centers { - for j := range centers[i] { - centers[i][j] /= float64(clustering.sizes[i]) - } - } - - goroutine2 := func(centerIdx int) { - defer wg.Done() - dkmin := math.MaxFloat64 - dki := -1 - center := centers[centerIdx] for j, row := range *data { if classes[j] == centerIdx { - dist := squareDist(row, center) - if dist < dkmin { - dkmin = dist - dki = j + for l, val := range row { + center[l] += val } } } - clustering.icenters[centerIdx] = dki - clustering.centers[centerIdx] = (*data)[dki] + + for j := range centers[centerIdx] { + center[j] /= float64(clustering.sizes[centerIdx]) + } + + C := clustering.ClosestPoint(center) + + centers[centerIdx] = (*data)[C] + clustering.icenters[centerIdx] = C } - for i := 0; i < k; i++ { + for i := 0; i < K; i++ { wg.Add(1) - go goroutine2(i) + go newCenter(i) } wg.Wait() + clustering.AssignToClass() + } func (clustering *KmeansClustering) Inertia() float64 { inertia := 0.0 for i := 0; i < clustering.N(); i++ { - inertia += clustering.distmin[i] + inertia += clustering.distmin[i] * clustering.distmin[i] } return inertia } @@ -264,7 +394,6 @@ func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool { newI := clustering.Inertia() for i := 0; i < max_cycle && (prev-newI) > threshold; i++ { prev = newI - clustering.AssignToClass() clustering.ComputeCenters() newI = clustering.Inertia() } diff --git a/pkg/obitools/obilandmark/obilandmark.go b/pkg/obitools/obilandmark/obilandmark.go index 422a68d..c2ba2ab 100644 --- a/pkg/obitools/obilandmark/obilandmark.go +++ b/pkg/obitools/obilandmark/obilandmark.go @@ -109,43 +109,26 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque n_landmark := CLINCenter() landmark_idx := obistats.SampleIntWithoutReplacement(n_landmark, library_size) + sort.IntSlice(landmark_idx).Sort() log.Infof("Library contains %d sequence", len(library)) var seqworld obiutils.Matrix[float64] for loop := 0; loop < 2; loop++ { - sort.IntSlice(landmark_idx).Sort() log.Debugf("Selected indices : %v", landmark_idx) seqworld = MapOnLandmarkSequences(library, landmark_idx) - initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark) - for i, seq_idx := range landmark_idx { - initialCenters[i] = seqworld[seq_idx] - } - // classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters) classifier := obistats.MakeKmeansClustering(&seqworld, n_landmark, obistats.DefaultRG()) - _, centers, inertia, converged := classifier.Run(1000, 0.001) - intertia := classifier.Inertia() - _, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters) + converged := classifier.Run(1000, 0.001) + inertia := classifier.Inertia() - dist_centers := 0.0 - for i := 0; i < n_landmark; i++ { - center := (*centers)[i] - icenter := initialCenters[i] - for j := 0; j < n_landmark; j++ { - diff := center[j] - icenter[j] - dist_centers += diff * diff - } - } - - landmark_idx = obistats.KmeansBestRepresentative(&seqworld, centers) - log.Infof("Inertia: %f, Dist centers: %f, converged: %t", inertia, dist_centers, converged) + log.Infof("Inertia: %f, converged: %t", inertia, converged) + landmark_idx = classifier.CentersIndices() + sort.IntSlice(landmark_idx).Sort() } - sort.IntSlice(landmark_idx).Sort() - log.Debugf("Selected indices : %v", landmark_idx) seqworld = MapOnLandmarkSequences(library, landmark_idx) @@ -159,12 +142,14 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque initialCenters[i] = seqworld[seq_idx] } - classes := obistats.AssignToClass(&seqworld, &initialCenters) + // classes := obistats.AssignToClass(&seqworld, &initialCenters) for i, seq := range library { ic, _ := obiutils.InterfaceToIntSlice(seqworld[i]) seq.SetCoordinate(ic) - seq.SetAttribute("landmark_class", classes[i]) + // seq.SetAttribute("landmark_class", classes[i]) + + // if the sequence is a landmark sequence if i, ok := seq_landmark[i]; ok { seq.SetAttribute("landmark_id", i) } diff --git a/pkg/obitools/obirefidx/geomindexing.go b/pkg/obitools/obirefidx/geomindexing.go index a77af7b..06e7dd0 100644 --- a/pkg/obitools/obirefidx/geomindexing.go +++ b/pkg/obitools/obirefidx/geomindexing.go @@ -7,6 +7,7 @@ import ( "sync" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) @@ -35,12 +36,8 @@ func GeomIndexSesquence(seqidx int, if reflocation == nil { log.Fatalf("Sequence %s does not have a coordinate", ref.Id()) } - d := 0.0 - for i, x := range location { - diff := float64(x - reflocation[i]) - d += diff * diff - } - seq_dist[i] = d + + seq_dist[i] = obistats.SquareDist(location, reflocation) }(i, ref) } @@ -51,18 +48,16 @@ func GeomIndexSesquence(seqidx int, lca := (*taxa)[seqidx] index := make(map[int]string) - index[0.0] = fmt.Sprintf( + index[0] = fmt.Sprintf( "%d@%s@%s", lca.Taxid(), lca.ScientificName(), lca.Rank()) - old_dist := 0.0 for _, o := range order { new_lca, _ := lca.LCA((*taxa)[o]) - if new_lca.Taxid() != lca.Taxid() || seq_dist[o] != old_dist { + if new_lca.Taxid() != lca.Taxid() { lca = new_lca - old_dist = seq_dist[o] index[int(seq_dist[o])] = fmt.Sprintf( "%d@%s@%s", lca.Taxid(), diff --git a/pkg/obitools/obitag/obigeomtag.go b/pkg/obitools/obitag/obigeomtag.go index 8645ee5..5fec126 100644 --- a/pkg/obitools/obitag/obigeomtag.go +++ b/pkg/obitools/obitag/obigeomtag.go @@ -8,6 +8,7 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" ) @@ -74,7 +75,7 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS coords := make([]int, len(*landmarks)) for i, l := range *landmarks { - lcs, length := obialign.FastLCSEGFScore(sequence, l, -1, buffer) + lcs, length := obialign.FastLCSScore(sequence, l, -1, buffer) coords[i] = length - lcs } @@ -98,9 +99,9 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS func FindGeomClosest(sequence *obiseq.BioSequence, landmarks *obiseq.BioSequenceSlice, references *obiseq.BioSequenceSlice, - buffer *[]uint64) (*obiseq.BioSequence, int, float64, []int, *obiseq.BioSequenceSlice) { + buffer *[]uint64) (*obiseq.BioSequence, float64, float64, []int, *obiseq.BioSequenceSlice) { - min_dist := math.MaxInt64 + min_dist := math.MaxFloat64 min_idx := make([]int, 0) query_location := MapOnLandmarkSequences(sequence, landmarks, buffer) @@ -110,11 +111,8 @@ func FindGeomClosest(sequence *obiseq.BioSequence, if len(coord) == 0 { log.Panicf("Empty coordinate for reference sequence %s", l.Id()) } - dist := 0 - for j := 0; j < len(coord); j++ { - diff := query_location[j] - coord[j] - dist += diff * diff - } + + dist := obistats.SquareDist(coord, query_location) if dist == min_dist { min_idx = append(min_idx, i) diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index d85fbdb..6cac5f3 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -6,6 +6,8 @@ import ( "strings" log "github.com/sirupsen/logrus" + "golang.org/x/exp/maps" + "golang.org/x/exp/slices" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" @@ -27,22 +29,20 @@ import ( // - taxid: The taxid associated with the matched distance. // - rank: The rank associated with the matched distance. // - scientificName: The scientific name associated with the matched distance. -func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) { - keys := make([]int, 0, len(distanceIdx)) - for k := range distanceIdx { - keys = append(keys, k) - } - sort.Ints(keys) +func MatchDistanceIndex(distance float64, distanceIdx map[int]string) (int, string, string) { + idist := int(distance) + keys := maps.Keys(distanceIdx) + slices.Sort(keys) i := sort.Search(len(keys), func(i int) bool { - return distance <= keys[i] + return idist <= keys[i] }) var taxid int var rank string var scientificName string - if i == len(keys) || distance > keys[len(keys)-1] { + if i == len(keys) || idist > keys[len(keys)-1] { taxid = 1 rank = "no rank" scientificName = "root" diff --git a/pkg/obiutils/ranks.go b/pkg/obiutils/ranks.go index 7e2ed66..db86688 100644 --- a/pkg/obiutils/ranks.go +++ b/pkg/obiutils/ranks.go @@ -15,6 +15,12 @@ func (r intRanker) Len() int { return len(r.x) } func (r intRanker) Less(i, j int) bool { return r.x[r.r[i]] < r.x[r.r[j]] } func (r intRanker) Swap(i, j int) { r.r[i], r.r[j] = r.r[j], r.r[i] } +// IntOrder sorts a slice of integers and returns a slice +// of indices that represents the order of the sorted +// elements. +// +// `data` is a slice of integers to be ordered. +// Returns a slice of the ordered indices. func IntOrder(data []int) []int { if len(data) == 0 { return nil