Implements the kmeans++ algo to select the landmarks in the geometric method

Former-commit-id: 732404a0dc6d7276e4e479dd2481aa4bd42d4ce5
2025-06-29 16:20:46 +00:00 · 2023-12-11 16:07:03 +01:00
parent 37c3e16d5d
commit 2caaa62485
8 changed files with 259 additions and 140 deletions
--- a/Release-notes.md
+++ b/Release-notes.md
@ -21,7 +21,11 @@

 ### Enhancement

-   For efficiency purposes, now the `obiuniq` command run on disk by default. Consequently, the **--on-disk** option has been replaced by **--in-memory** to ask explicitly to use memory.
+-   For efficiency purposes, now the `obiuniq` command run on disk by default. Consequently, the **--on-disk** 
+    option has been replaced by **--in-memory** to ask explicitly to use memory.
+-   Adds an option **--penalty-scale** to the `obipairing` and `obipcrtag` command to fine tune the pairing score 
+    in the system of the alignment procedure by applying a scaling factor to the mismatch score and the gap score
+    relatively to the match score.

 ### Bug fixes

--- a/pkg/obiseq/biosequenceslice.go
+++ b/pkg/obiseq/biosequenceslice.go
@ -31,7 +31,7 @@ func NewBioSequenceSlice(size ...int) *BioSequenceSlice {
 	slice := _BioSequenceSlicePool.Get().(*BioSequenceSlice)
 	if len(size) > 0 {
 		s := size[0]
-		slice = slice.InsureCapacity(s)
+		slice = slice.EnsureCapacity(s)
 		(*slice) = (*slice)[0:s]
 	}
 	return slice
@ -76,11 +76,11 @@ func (s *BioSequenceSlice) Recycle(including_seq bool) {
 	_BioSequenceSlicePool.Put(s)
 }

-// InsureCapacity ensures that the BioSequenceSlice has a minimum capacity
+// EnsureCapacity ensures that the BioSequenceSlice has a minimum capacity
 //
 // It takes an integer `capacity` as a parameter, which represents the desired minimum capacity of the BioSequenceSlice.
 // It returns a pointer to the BioSequenceSlice.
-func (s *BioSequenceSlice) InsureCapacity(capacity int) *BioSequenceSlice {
+func (s *BioSequenceSlice) EnsureCapacity(capacity int) *BioSequenceSlice {
 	var c int
 	if s != nil {
 		c = cap(*s)
@ -88,7 +88,9 @@ func (s *BioSequenceSlice) InsureCapacity(capacity int) *BioSequenceSlice {
 		c = 0
 	}

-	*s = slices.Grow[BioSequenceSlice](*s, capacity-c)
+	if capacity > c {
+		*s = slices.Grow[BioSequenceSlice](*s, capacity-c)
+	}

 	return s
 }
--- a/pkg/obistats/kmeans.go
+++ b/pkg/obistats/kmeans.go
@ -12,43 +12,75 @@ import (
 	log "github.com/sirupsen/logrus"
 )

-func squareDist(a, b []float64) float64 {
+// SquareDist calculates the squared Euclidean distance between
+// two vectors 'a' and 'b'.
+//
+// 'a' and 'b' are slices of float64 values representing
+// coordinate points in space. It is assumed that both slices
+// have the same length.
+// Returns the calculated squared distance as a float64.
+func SquareDist[T float64 | int](a, b []T) float64 {
 	sum := 0.0
-	for i := 0; i < len(a); i++ {
-		diff := a[i] - b[i]
+	for i, v := range a {
+		diff := float64(v - b[i])
 		sum += diff * diff
 	}
 	return sum
 }

+// EuclideanDist calculates the Euclidean distance between
+// two vectors represented as slices of float64.
+//
+// `a` and `b` are slices of float64 where each element of `a`
+// is paired with the corresponding element of `b`.
+// Returns the squared sum of the differences.
+func EuclideanDist[T float64 | int](a, b []T) float64 {
+	return math.Sqrt(SquareDist(a, b))
+}
+
+// DefaultRG creates and returns a new instance of *rand.Rand.
+//
+// No parameters.
+// Returns *rand.Rand which is a pointer to a new random number
+// generator, seeded with the current time in nanoseconds.
 func DefaultRG() *rand.Rand {
 	return rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
 }

 type KmeansClustering struct {
-	data     *obiutils.Matrix[float64]
-	rg       *rand.Rand
-	centers  obiutils.Matrix[float64]
-	icenters []int
-	sizes    []int
-	distmin  []float64
-	classes  []int
+	data     *obiutils.Matrix[float64] // data matrix    						dimensions: n x p
+	distmin  []float64                 // distance to closest center			dimension: n
+	classes  []int                     // class of each data point				dimension: n
+	rg       *rand.Rand                // random number generator
+	centers  obiutils.Matrix[float64]  // centers coordinates 					dimensions: k x p
+	icenters []int                     // indices of centers					dimension: k
+	sizes    []int                     // number of elements in each cluster	dimension: k
 }

+// MakeKmeansClustering initializes a KmeansClustering with the
+// provided matrix data, number of clusters k, and random number
+// generator rg.
+//
+// data is a pointer to a Matrix of float64 representing the dataset,
+// k is the number of desired clusters, and rg is a pointer to a
+// random number generator used in the clustering process.
+// Returns a pointer to the initialized KmeansClustering structure.
 func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
 	distmin := make([]float64, len(*data))
+	classes := make([]int, len(*data))
 	for i := 0; i < len(distmin); i++ {
 		distmin[i] = math.MaxFloat64
+		classes[i] = -1
 	}

 	clustering := &KmeansClustering{
 		data:     data,
+		distmin:  distmin,
+		classes:  classes,
+		rg:       rg,
+		centers:  make(obiutils.Matrix[float64], 0, k),
 		icenters: make([]int, 0, k),
 		sizes:    make([]int, 0, k),
-		centers:  make(obiutils.Matrix[float64], 0, k),
-		distmin:  distmin,
-		classes:  make([]int, len(*data)),
-		rg:       rg,
 	}

 	for i := 0; i < k; i++ {
@ -81,79 +113,160 @@ func (clustering *KmeansClustering) N() int {
 func (clustering *KmeansClustering) Dimension() int {
 	return len((*clustering.data)[0])
 }
+
+// SetCenterTo sets the center of a specific cluster to a given data point index.
+//
+// Parameters:
+// - k: the index of the cluster, if k=-1, a new center is added
+// - i: the index of the data point
+// - reset: a boolean indicating whether to reset the distances to the nearest center
+//          for points previously assigned to this center
+//
+// No return value.
+
+func (clustering *KmeansClustering) SetCenterTo(k, i int, reset bool) {
+	N := clustering.N()
+	K := clustering.K()
+	center := (*clustering.data)[i]
+
+	if k >= 0 {
+		clustering.icenters[k] = i
+		clustering.sizes[k] = 0
+		clustering.centers[k] = center
+
+		if reset {
+			// Recompute distances to the nearest center for points
+			// previously assigned to this center
+			K := clustering.K()
+			for j := 0; j < N; j++ {
+				if clustering.classes[j] == k {
+					clustering.distmin[j] = math.MaxFloat64
+					for l := 1; l < K; l++ {
+						dist := EuclideanDist((*clustering.data)[j], clustering.centers[l])
+						if dist < clustering.distmin[j] {
+							clustering.distmin[j] = dist
+							clustering.classes[j] = l
+						}
+					}
+					clustering.sizes[clustering.classes[j]]++
+				}
+			}
+		}
+
+	} else {
+		clustering.icenters = append(clustering.icenters, i)
+		clustering.sizes = append(clustering.sizes, 0)
+		clustering.centers = append(clustering.centers, center)
+		k = K
+		K++
+	}
+
+	for j := 0; j < clustering.N(); j++ {
+		dist := EuclideanDist((*clustering.data)[j], center)
+		if dist < clustering.distmin[j] {
+			if C := clustering.classes[j]; C >= 0 {
+				clustering.sizes[C]--
+			}
+			clustering.distmin[j] = dist
+			clustering.classes[j] = k
+			clustering.sizes[k]++
+		}
+	}
+
+}
+
+// AddACenter adds a new center to the KmeansClustering.
+//
+// If there are no centers, it randomly selects a new center.
+// If there are existing centers, it selects a new center with
+// probability proportional to its distance from the nearest
+// center. The center is then added to the clustering.
 func (clustering *KmeansClustering) AddACenter() {
+	k := clustering.K()
 	C := 0
-	if clustering.K() == 0 {
+
+	if k == 0 {
+		// if there are no centers yet, draw a sample as the first center
 		C = rand.Intn(clustering.N())
 	} else {
+		// otherwise, draw a sample with a probability proportional
+		// to its closest distance to a center
 		w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
 		C, _ = w.Take()
 	}
-	clustering.icenters = append(clustering.icenters, C)
-	clustering.sizes = append(clustering.sizes, 0)
-	center := (*clustering.data)[C]
-	clustering.centers = append(clustering.centers, center)

-	n := clustering.N()
-
-	for i := 0; i < n; i++ {
-		d := squareDist((*clustering.data)[i], center)
-		if d < clustering.distmin[i] {
-			clustering.distmin[i] = d
-		}
-	}
+	clustering.SetCenterTo(-1, C, false)
 }

-// ResetEmptyCenters resets the empty centers in the KmeansClustering struct.
+// ResetEmptyCenters reinitializes any centers in a KmeansClustering
+// that have no assigned points.
 //
-// It iterates over the centers and checks if their corresponding sizes are zero.
-// If a center is empty, a new weighted sample is taken with the help of the distmin and rg variables.
-// The new center is then assigned to the empty center index, and the sizes and centers arrays are updated accordingly.
-// Finally, the function returns the number of empty centers that were reset.
+// This method iterates over the centers and uses a weighted sampling
+// to reset centers with a size of zero.
+// Returns the number of centers that were reset.
 func (clustering *KmeansClustering) ResetEmptyCenters() int {
 	nreset := 0
 	for i := 0; i < clustering.K(); i++ {
 		if clustering.sizes[i] == 0 {
 			w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
 			C, _ := w.Take()
-			clustering.icenters[i] = C
-			clustering.centers[i] = (*clustering.data)[C]
+			clustering.SetCenterTo(i, C, false)
 			nreset++
 		}
 	}
 	return nreset
 }

-// AssignToClass assigns each data point to a class based on the distance to the nearest center.
+// ClosestPoint finds the index of the closest point in the
+// clustering to the given coordinates.
 //
-// This function does not take any parameters.
-// It does not return anything.
+// coordinates is a slice of float64 representing the point.
+// Returns the index of the closest point as an int.
+func (clustering *KmeansClustering) ClosestPoint(coordinates []float64) int {
+	N := clustering.N()
+	distmin := math.MaxFloat64
+	C := -1
+	for i := 0; i < N; i++ {
+		dist := EuclideanDist((*clustering.data)[i], coordinates)
+		if dist < distmin {
+			distmin = dist
+			C = i
+		}
+	}
+	return C
+}
+
+// AssignToClass assigns each data point in the dataset to the nearest
+// center (class) in a K-means clustering algorithm.
+//
+// Handles the reinitialization of empty centers after the assignment.
+// No return values.
 func (clustering *KmeansClustering) AssignToClass() {
 	var wg sync.WaitGroup
 	var lock sync.Mutex

+	// initialize the number of points in each class
 	for i := 0; i < clustering.K(); i++ {
 		clustering.sizes[i] = 0
 	}
-	for i := 0; i < clustering.N(); i++ {
-		clustering.distmin[i] = math.MaxFloat64
-	}

 	goroutine := func(i int) {
 		defer wg.Done()
 		dmin := math.MaxFloat64
 		cmin := -1
 		for j, center := range clustering.centers {
-			dist := squareDist((*clustering.data)[i], center)
+			dist := EuclideanDist((*clustering.data)[i], center)
 			if dist < dmin {
 				dmin = dist
 				cmin = j
 			}
 		}
-		lock.Lock()
+
 		clustering.classes[i] = cmin
-		clustering.sizes[cmin]++
 		clustering.distmin[i] = dmin
+
+		lock.Lock()
+		clustering.sizes[cmin]++
 		lock.Unlock()
 	}

@ -162,16 +275,45 @@ func (clustering *KmeansClustering) AssignToClass() {
 		go goroutine(i)
 	}

+	wg.Wait()
+
 	nreset := clustering.ResetEmptyCenters()

 	if nreset > 0 {
-		log.Warnf("Reset %d empty centers", nreset)
-		clustering.AssignToClass()
+		log.Warnf("Reseted %d empty centers", nreset)
 	}
 }

+// SetCentersTo assigns new centers in the KmeansClustering
+// structure given a slice of indices.
+//
+// The indices parameter is a slice of integers that
+// corresponds to the new indices of the cluster centers in
+// the dataset. It panics if any index is out of bounds.
+// This method does not return any value.
+func (clustering *KmeansClustering) SetCentersTo(indices []int) {
+	for _, v := range indices {
+		if v < 0 || v >= clustering.N() {
+			log.Fatalf("Invalid center index: %d", v)
+		}
+	}
+
+	clustering.icenters = indices
+	K := len(indices)
+
+	for i := 0; i < K; i++ {
+		clustering.centers[i] = (*clustering.data)[indices[i]]
+	}
+
+	clustering.AssignToClass()
+
+}
+
 // ComputeCenters calculates the centers of the K-means clustering algorithm.
 //
+// This method call AssignToClass() after computing the centers to ensure coherence
+// of the clustering data structure.
+//
 // It takes no parameters.
 // It does not return any values.
 func (clustering *KmeansClustering) ComputeCenters() {
@ -179,66 +321,54 @@ func (clustering *KmeansClustering) ComputeCenters() {
 	centers := clustering.centers
 	data := clustering.data
 	classes := clustering.classes
-	k := clustering.K()
+	K := clustering.K()

-	// Goroutine code
-	goroutine1 := func(centerIdx int) {
+	// compute the location of center of class centerIdx
+	// as the point in the data the closest to the
+	// center of class centerIdx
+	newCenter := func(centerIdx int) {
 		defer wg.Done()
-		for j, row := range *data {
-			class := classes[j]
-			if class == centerIdx {
-				for l, val := range row {
-					centers[centerIdx][l] += val
-				}
-			}
+
+		center := make([]float64, clustering.Dimension())
+
+		for j := range center {
+			center[j] = 0
 		}
-	}

-	for i := 0; i < k; i++ {
-		wg.Add(1)
-		go goroutine1(i)
-	}
-
-	wg.Wait()
-
-	for i := range centers {
-		for j := range centers[i] {
-			centers[i][j] /= float64(clustering.sizes[i])
-		}
-	}
-
-	goroutine2 := func(centerIdx int) {
-		defer wg.Done()
-		dkmin := math.MaxFloat64
-		dki := -1
-		center := centers[centerIdx]
 		for j, row := range *data {
 			if classes[j] == centerIdx {
-				dist := squareDist(row, center)
-				if dist < dkmin {
-					dkmin = dist
-					dki = j
+				for l, val := range row {
+					center[l] += val
 				}
 			}
 		}
-		clustering.icenters[centerIdx] = dki
-		clustering.centers[centerIdx] = (*data)[dki]
+
+		for j := range centers[centerIdx] {
+			center[j] /= float64(clustering.sizes[centerIdx])
+		}
+
+		C := clustering.ClosestPoint(center)
+
+		centers[centerIdx] = (*data)[C]
+		clustering.icenters[centerIdx] = C
 	}

-	for i := 0; i < k; i++ {
+	for i := 0; i < K; i++ {
 		wg.Add(1)
-		go goroutine2(i)
+		go newCenter(i)
 	}

 	wg.Wait()

+	clustering.AssignToClass()
+
 }

 func (clustering *KmeansClustering) Inertia() float64 {
 	inertia := 0.0

 	for i := 0; i < clustering.N(); i++ {
-		inertia += clustering.distmin[i]
+		inertia += clustering.distmin[i] * clustering.distmin[i]
 	}
 	return inertia
 }
@ -264,7 +394,6 @@ func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool {
 	newI := clustering.Inertia()
 	for i := 0; i < max_cycle && (prev-newI) > threshold; i++ {
 		prev = newI
-		clustering.AssignToClass()
 		clustering.ComputeCenters()
 		newI = clustering.Inertia()
 	}
--- a/pkg/obitools/obilandmark/obilandmark.go
+++ b/pkg/obitools/obilandmark/obilandmark.go
@ -109,43 +109,26 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
 	n_landmark := CLINCenter()

 	landmark_idx := obistats.SampleIntWithoutReplacement(n_landmark, library_size)
+	sort.IntSlice(landmark_idx).Sort()
 	log.Infof("Library contains %d sequence", len(library))

 	var seqworld obiutils.Matrix[float64]

 	for loop := 0; loop < 2; loop++ {
-		sort.IntSlice(landmark_idx).Sort()
 		log.Debugf("Selected indices : %v", landmark_idx)

 		seqworld = MapOnLandmarkSequences(library, landmark_idx)
-		initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark)
-		for i, seq_idx := range landmark_idx {
-			initialCenters[i] = seqworld[seq_idx]
-		}

-		// classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters)
 		classifier := obistats.MakeKmeansClustering(&seqworld, n_landmark, obistats.DefaultRG())
-		_, centers, inertia, converged := classifier.Run(1000, 0.001)
-		intertia := classifier.Inertia()
-		_, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters)
+		converged := classifier.Run(1000, 0.001)
+		inertia := classifier.Inertia()

-		dist_centers := 0.0
-		for i := 0; i < n_landmark; i++ {
-			center := (*centers)[i]
-			icenter := initialCenters[i]
-			for j := 0; j < n_landmark; j++ {
-				diff := center[j] - icenter[j]
-				dist_centers += diff * diff
-			}
-		}
-
-		landmark_idx = obistats.KmeansBestRepresentative(&seqworld, centers)
-		log.Infof("Inertia: %f, Dist centers: %f, converged: %t", inertia, dist_centers, converged)
+		log.Infof("Inertia: %f, converged: %t", inertia, converged)

+		landmark_idx = classifier.CentersIndices()
+		sort.IntSlice(landmark_idx).Sort()
 	}

-	sort.IntSlice(landmark_idx).Sort()
-
 	log.Debugf("Selected indices : %v", landmark_idx)
 	seqworld = MapOnLandmarkSequences(library, landmark_idx)

@ -159,12 +142,14 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
 		initialCenters[i] = seqworld[seq_idx]
 	}

-	classes := obistats.AssignToClass(&seqworld, &initialCenters)
+	// classes := obistats.AssignToClass(&seqworld, &initialCenters)

 	for i, seq := range library {
 		ic, _ := obiutils.InterfaceToIntSlice(seqworld[i])
 		seq.SetCoordinate(ic)
-		seq.SetAttribute("landmark_class", classes[i])
+		// seq.SetAttribute("landmark_class", classes[i])
+
+		// if the sequence is a landmark sequence
 		if i, ok := seq_landmark[i]; ok {
 			seq.SetAttribute("landmark_id", i)
 		}
--- a/pkg/obitools/obirefidx/geomindexing.go
+++ b/pkg/obitools/obirefidx/geomindexing.go
@ -7,6 +7,7 @@ import (
 	"sync"

 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
 )
@ -35,12 +36,8 @@ func GeomIndexSesquence(seqidx int,
 			if reflocation == nil {
 				log.Fatalf("Sequence %s does not have a coordinate", ref.Id())
 			}
-			d := 0.0
-			for i, x := range location {
-				diff := float64(x - reflocation[i])
-				d += diff * diff
-			}
-			seq_dist[i] = d
+
+			seq_dist[i] = obistats.SquareDist(location, reflocation)
 		}(i, ref)
 	}

@ -51,18 +48,16 @@ func GeomIndexSesquence(seqidx int,
 	lca := (*taxa)[seqidx]

 	index := make(map[int]string)
-	index[0.0] = fmt.Sprintf(
+	index[0] = fmt.Sprintf(
 		"%d@%s@%s",
 		lca.Taxid(),
 		lca.ScientificName(),
 		lca.Rank())

-	old_dist := 0.0
 	for _, o := range order {
 		new_lca, _ := lca.LCA((*taxa)[o])
-		if new_lca.Taxid() != lca.Taxid() || seq_dist[o] != old_dist {
+		if new_lca.Taxid() != lca.Taxid() {
 			lca = new_lca
-			old_dist = seq_dist[o]
 			index[int(seq_dist[o])] = fmt.Sprintf(
 				"%d@%s@%s",
 				lca.Taxid(),
--- a/pkg/obitools/obitag/obigeomtag.go
+++ b/pkg/obitools/obitag/obigeomtag.go
@ -8,6 +8,7 @@ import (
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
 )

@ -74,7 +75,7 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS
 	coords := make([]int, len(*landmarks))

 	for i, l := range *landmarks {
-		lcs, length := obialign.FastLCSEGFScore(sequence, l, -1, buffer)
+		lcs, length := obialign.FastLCSScore(sequence, l, -1, buffer)
 		coords[i] = length - lcs
 	}

@ -98,9 +99,9 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS
 func FindGeomClosest(sequence *obiseq.BioSequence,
 	landmarks *obiseq.BioSequenceSlice,
 	references *obiseq.BioSequenceSlice,
-	buffer *[]uint64) (*obiseq.BioSequence, int, float64, []int, *obiseq.BioSequenceSlice) {
+	buffer *[]uint64) (*obiseq.BioSequence, float64, float64, []int, *obiseq.BioSequenceSlice) {

-	min_dist := math.MaxInt64
+	min_dist := math.MaxFloat64
 	min_idx := make([]int, 0)

 	query_location := MapOnLandmarkSequences(sequence, landmarks, buffer)
@ -110,11 +111,8 @@ func FindGeomClosest(sequence *obiseq.BioSequence,
 		if len(coord) == 0 {
 			log.Panicf("Empty coordinate for reference sequence %s", l.Id())
 		}
-		dist := 0
-		for j := 0; j < len(coord); j++ {
-			diff := query_location[j] - coord[j]
-			dist += diff * diff
-		}
+
+		dist := obistats.SquareDist(coord, query_location)

 		if dist == min_dist {
 			min_idx = append(min_idx, i)
--- a/pkg/obitools/obitag/obitag.go
+++ b/pkg/obitools/obitag/obitag.go
@ -6,6 +6,8 @@ import (
 	"strings"

 	log "github.com/sirupsen/logrus"
+	"golang.org/x/exp/maps"
+	"golang.org/x/exp/slices"

 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
@ -27,22 +29,20 @@ import (
 // - taxid: The taxid associated with the matched distance.
 // - rank: The rank associated with the matched distance.
 // - scientificName: The scientific name associated with the matched distance.
-func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
-	keys := make([]int, 0, len(distanceIdx))
-	for k := range distanceIdx {
-		keys = append(keys, k)
-	}
-	sort.Ints(keys)
+func MatchDistanceIndex(distance float64, distanceIdx map[int]string) (int, string, string) {
+	idist := int(distance)
+	keys := maps.Keys(distanceIdx)
+	slices.Sort(keys)

 	i := sort.Search(len(keys), func(i int) bool {
-		return distance <= keys[i]
+		return idist <= keys[i]
 	})

 	var taxid int
 	var rank string
 	var scientificName string

-	if i == len(keys) || distance > keys[len(keys)-1] {
+	if i == len(keys) || idist > keys[len(keys)-1] {
 		taxid = 1
 		rank = "no rank"
 		scientificName = "root"
--- a/pkg/obiutils/ranks.go
+++ b/pkg/obiutils/ranks.go
@ -15,6 +15,12 @@ func (r intRanker) Len() int           { return len(r.x) }
 func (r intRanker) Less(i, j int) bool { return r.x[r.r[i]] < r.x[r.r[j]] }
 func (r intRanker) Swap(i, j int)      { r.r[i], r.r[j] = r.r[j], r.r[i] }

+// IntOrder sorts a slice of integers and returns a slice
+// of indices that represents the order of the sorted
+// elements.
+//
+// `data` is a slice of integers to be ordered.
+// Returns a slice of the ordered indices.
 func IntOrder(data []int) []int {
 	if len(data) == 0 {
 		return nil