mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Implements the kmeans++ algo to select the landmarks in the geometric method
Former-commit-id: 732404a0dc6d7276e4e479dd2481aa4bd42d4ce5
This commit is contained in:
@ -21,7 +21,11 @@
|
||||
|
||||
### Enhancement
|
||||
|
||||
- For efficiency purposes, now the `obiuniq` command run on disk by default. Consequently, the **--on-disk** option has been replaced by **--in-memory** to ask explicitly to use memory.
|
||||
- For efficiency purposes, now the `obiuniq` command run on disk by default. Consequently, the **--on-disk**
|
||||
option has been replaced by **--in-memory** to ask explicitly to use memory.
|
||||
- Adds an option **--penalty-scale** to the `obipairing` and `obipcrtag` command to fine tune the pairing score
|
||||
in the system of the alignment procedure by applying a scaling factor to the mismatch score and the gap score
|
||||
relatively to the match score.
|
||||
|
||||
### Bug fixes
|
||||
|
||||
|
@ -31,7 +31,7 @@ func NewBioSequenceSlice(size ...int) *BioSequenceSlice {
|
||||
slice := _BioSequenceSlicePool.Get().(*BioSequenceSlice)
|
||||
if len(size) > 0 {
|
||||
s := size[0]
|
||||
slice = slice.InsureCapacity(s)
|
||||
slice = slice.EnsureCapacity(s)
|
||||
(*slice) = (*slice)[0:s]
|
||||
}
|
||||
return slice
|
||||
@ -76,11 +76,11 @@ func (s *BioSequenceSlice) Recycle(including_seq bool) {
|
||||
_BioSequenceSlicePool.Put(s)
|
||||
}
|
||||
|
||||
// InsureCapacity ensures that the BioSequenceSlice has a minimum capacity
|
||||
// EnsureCapacity ensures that the BioSequenceSlice has a minimum capacity
|
||||
//
|
||||
// It takes an integer `capacity` as a parameter, which represents the desired minimum capacity of the BioSequenceSlice.
|
||||
// It returns a pointer to the BioSequenceSlice.
|
||||
func (s *BioSequenceSlice) InsureCapacity(capacity int) *BioSequenceSlice {
|
||||
func (s *BioSequenceSlice) EnsureCapacity(capacity int) *BioSequenceSlice {
|
||||
var c int
|
||||
if s != nil {
|
||||
c = cap(*s)
|
||||
@ -88,7 +88,9 @@ func (s *BioSequenceSlice) InsureCapacity(capacity int) *BioSequenceSlice {
|
||||
c = 0
|
||||
}
|
||||
|
||||
*s = slices.Grow[BioSequenceSlice](*s, capacity-c)
|
||||
if capacity > c {
|
||||
*s = slices.Grow[BioSequenceSlice](*s, capacity-c)
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
@ -12,43 +12,75 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func squareDist(a, b []float64) float64 {
|
||||
// SquareDist calculates the squared Euclidean distance between
|
||||
// two vectors 'a' and 'b'.
|
||||
//
|
||||
// 'a' and 'b' are slices of float64 values representing
|
||||
// coordinate points in space. It is assumed that both slices
|
||||
// have the same length.
|
||||
// Returns the calculated squared distance as a float64.
|
||||
func SquareDist[T float64 | int](a, b []T) float64 {
|
||||
sum := 0.0
|
||||
for i := 0; i < len(a); i++ {
|
||||
diff := a[i] - b[i]
|
||||
for i, v := range a {
|
||||
diff := float64(v - b[i])
|
||||
sum += diff * diff
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
// EuclideanDist calculates the Euclidean distance between
|
||||
// two vectors represented as slices of float64.
|
||||
//
|
||||
// `a` and `b` are slices of float64 where each element of `a`
|
||||
// is paired with the corresponding element of `b`.
|
||||
// Returns the squared sum of the differences.
|
||||
func EuclideanDist[T float64 | int](a, b []T) float64 {
|
||||
return math.Sqrt(SquareDist(a, b))
|
||||
}
|
||||
|
||||
// DefaultRG creates and returns a new instance of *rand.Rand.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns *rand.Rand which is a pointer to a new random number
|
||||
// generator, seeded with the current time in nanoseconds.
|
||||
func DefaultRG() *rand.Rand {
|
||||
return rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
|
||||
}
|
||||
|
||||
type KmeansClustering struct {
|
||||
data *obiutils.Matrix[float64]
|
||||
rg *rand.Rand
|
||||
centers obiutils.Matrix[float64]
|
||||
icenters []int
|
||||
sizes []int
|
||||
distmin []float64
|
||||
classes []int
|
||||
data *obiutils.Matrix[float64] // data matrix dimensions: n x p
|
||||
distmin []float64 // distance to closest center dimension: n
|
||||
classes []int // class of each data point dimension: n
|
||||
rg *rand.Rand // random number generator
|
||||
centers obiutils.Matrix[float64] // centers coordinates dimensions: k x p
|
||||
icenters []int // indices of centers dimension: k
|
||||
sizes []int // number of elements in each cluster dimension: k
|
||||
}
|
||||
|
||||
// MakeKmeansClustering initializes a KmeansClustering with the
|
||||
// provided matrix data, number of clusters k, and random number
|
||||
// generator rg.
|
||||
//
|
||||
// data is a pointer to a Matrix of float64 representing the dataset,
|
||||
// k is the number of desired clusters, and rg is a pointer to a
|
||||
// random number generator used in the clustering process.
|
||||
// Returns a pointer to the initialized KmeansClustering structure.
|
||||
func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
|
||||
distmin := make([]float64, len(*data))
|
||||
classes := make([]int, len(*data))
|
||||
for i := 0; i < len(distmin); i++ {
|
||||
distmin[i] = math.MaxFloat64
|
||||
classes[i] = -1
|
||||
}
|
||||
|
||||
clustering := &KmeansClustering{
|
||||
data: data,
|
||||
distmin: distmin,
|
||||
classes: classes,
|
||||
rg: rg,
|
||||
centers: make(obiutils.Matrix[float64], 0, k),
|
||||
icenters: make([]int, 0, k),
|
||||
sizes: make([]int, 0, k),
|
||||
centers: make(obiutils.Matrix[float64], 0, k),
|
||||
distmin: distmin,
|
||||
classes: make([]int, len(*data)),
|
||||
rg: rg,
|
||||
}
|
||||
|
||||
for i := 0; i < k; i++ {
|
||||
@ -81,79 +113,160 @@ func (clustering *KmeansClustering) N() int {
|
||||
func (clustering *KmeansClustering) Dimension() int {
|
||||
return len((*clustering.data)[0])
|
||||
}
|
||||
|
||||
// SetCenterTo sets the center of a specific cluster to a given data point index.
|
||||
//
|
||||
// Parameters:
|
||||
// - k: the index of the cluster, if k=-1, a new center is added
|
||||
// - i: the index of the data point
|
||||
// - reset: a boolean indicating whether to reset the distances to the nearest center
|
||||
// for points previously assigned to this center
|
||||
//
|
||||
// No return value.
|
||||
|
||||
func (clustering *KmeansClustering) SetCenterTo(k, i int, reset bool) {
|
||||
N := clustering.N()
|
||||
K := clustering.K()
|
||||
center := (*clustering.data)[i]
|
||||
|
||||
if k >= 0 {
|
||||
clustering.icenters[k] = i
|
||||
clustering.sizes[k] = 0
|
||||
clustering.centers[k] = center
|
||||
|
||||
if reset {
|
||||
// Recompute distances to the nearest center for points
|
||||
// previously assigned to this center
|
||||
K := clustering.K()
|
||||
for j := 0; j < N; j++ {
|
||||
if clustering.classes[j] == k {
|
||||
clustering.distmin[j] = math.MaxFloat64
|
||||
for l := 1; l < K; l++ {
|
||||
dist := EuclideanDist((*clustering.data)[j], clustering.centers[l])
|
||||
if dist < clustering.distmin[j] {
|
||||
clustering.distmin[j] = dist
|
||||
clustering.classes[j] = l
|
||||
}
|
||||
}
|
||||
clustering.sizes[clustering.classes[j]]++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
clustering.icenters = append(clustering.icenters, i)
|
||||
clustering.sizes = append(clustering.sizes, 0)
|
||||
clustering.centers = append(clustering.centers, center)
|
||||
k = K
|
||||
K++
|
||||
}
|
||||
|
||||
for j := 0; j < clustering.N(); j++ {
|
||||
dist := EuclideanDist((*clustering.data)[j], center)
|
||||
if dist < clustering.distmin[j] {
|
||||
if C := clustering.classes[j]; C >= 0 {
|
||||
clustering.sizes[C]--
|
||||
}
|
||||
clustering.distmin[j] = dist
|
||||
clustering.classes[j] = k
|
||||
clustering.sizes[k]++
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// AddACenter adds a new center to the KmeansClustering.
|
||||
//
|
||||
// If there are no centers, it randomly selects a new center.
|
||||
// If there are existing centers, it selects a new center with
|
||||
// probability proportional to its distance from the nearest
|
||||
// center. The center is then added to the clustering.
|
||||
func (clustering *KmeansClustering) AddACenter() {
|
||||
k := clustering.K()
|
||||
C := 0
|
||||
if clustering.K() == 0 {
|
||||
|
||||
if k == 0 {
|
||||
// if there are no centers yet, draw a sample as the first center
|
||||
C = rand.Intn(clustering.N())
|
||||
} else {
|
||||
// otherwise, draw a sample with a probability proportional
|
||||
// to its closest distance to a center
|
||||
w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
|
||||
C, _ = w.Take()
|
||||
}
|
||||
clustering.icenters = append(clustering.icenters, C)
|
||||
clustering.sizes = append(clustering.sizes, 0)
|
||||
center := (*clustering.data)[C]
|
||||
clustering.centers = append(clustering.centers, center)
|
||||
|
||||
n := clustering.N()
|
||||
|
||||
for i := 0; i < n; i++ {
|
||||
d := squareDist((*clustering.data)[i], center)
|
||||
if d < clustering.distmin[i] {
|
||||
clustering.distmin[i] = d
|
||||
}
|
||||
}
|
||||
clustering.SetCenterTo(-1, C, false)
|
||||
}
|
||||
|
||||
// ResetEmptyCenters resets the empty centers in the KmeansClustering struct.
|
||||
// ResetEmptyCenters reinitializes any centers in a KmeansClustering
|
||||
// that have no assigned points.
|
||||
//
|
||||
// It iterates over the centers and checks if their corresponding sizes are zero.
|
||||
// If a center is empty, a new weighted sample is taken with the help of the distmin and rg variables.
|
||||
// The new center is then assigned to the empty center index, and the sizes and centers arrays are updated accordingly.
|
||||
// Finally, the function returns the number of empty centers that were reset.
|
||||
// This method iterates over the centers and uses a weighted sampling
|
||||
// to reset centers with a size of zero.
|
||||
// Returns the number of centers that were reset.
|
||||
func (clustering *KmeansClustering) ResetEmptyCenters() int {
|
||||
nreset := 0
|
||||
for i := 0; i < clustering.K(); i++ {
|
||||
if clustering.sizes[i] == 0 {
|
||||
w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
|
||||
C, _ := w.Take()
|
||||
clustering.icenters[i] = C
|
||||
clustering.centers[i] = (*clustering.data)[C]
|
||||
clustering.SetCenterTo(i, C, false)
|
||||
nreset++
|
||||
}
|
||||
}
|
||||
return nreset
|
||||
}
|
||||
|
||||
// AssignToClass assigns each data point to a class based on the distance to the nearest center.
|
||||
// ClosestPoint finds the index of the closest point in the
|
||||
// clustering to the given coordinates.
|
||||
//
|
||||
// This function does not take any parameters.
|
||||
// It does not return anything.
|
||||
// coordinates is a slice of float64 representing the point.
|
||||
// Returns the index of the closest point as an int.
|
||||
func (clustering *KmeansClustering) ClosestPoint(coordinates []float64) int {
|
||||
N := clustering.N()
|
||||
distmin := math.MaxFloat64
|
||||
C := -1
|
||||
for i := 0; i < N; i++ {
|
||||
dist := EuclideanDist((*clustering.data)[i], coordinates)
|
||||
if dist < distmin {
|
||||
distmin = dist
|
||||
C = i
|
||||
}
|
||||
}
|
||||
return C
|
||||
}
|
||||
|
||||
// AssignToClass assigns each data point in the dataset to the nearest
|
||||
// center (class) in a K-means clustering algorithm.
|
||||
//
|
||||
// Handles the reinitialization of empty centers after the assignment.
|
||||
// No return values.
|
||||
func (clustering *KmeansClustering) AssignToClass() {
|
||||
var wg sync.WaitGroup
|
||||
var lock sync.Mutex
|
||||
|
||||
// initialize the number of points in each class
|
||||
for i := 0; i < clustering.K(); i++ {
|
||||
clustering.sizes[i] = 0
|
||||
}
|
||||
for i := 0; i < clustering.N(); i++ {
|
||||
clustering.distmin[i] = math.MaxFloat64
|
||||
}
|
||||
|
||||
goroutine := func(i int) {
|
||||
defer wg.Done()
|
||||
dmin := math.MaxFloat64
|
||||
cmin := -1
|
||||
for j, center := range clustering.centers {
|
||||
dist := squareDist((*clustering.data)[i], center)
|
||||
dist := EuclideanDist((*clustering.data)[i], center)
|
||||
if dist < dmin {
|
||||
dmin = dist
|
||||
cmin = j
|
||||
}
|
||||
}
|
||||
lock.Lock()
|
||||
|
||||
clustering.classes[i] = cmin
|
||||
clustering.sizes[cmin]++
|
||||
clustering.distmin[i] = dmin
|
||||
|
||||
lock.Lock()
|
||||
clustering.sizes[cmin]++
|
||||
lock.Unlock()
|
||||
}
|
||||
|
||||
@ -162,16 +275,45 @@ func (clustering *KmeansClustering) AssignToClass() {
|
||||
go goroutine(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
nreset := clustering.ResetEmptyCenters()
|
||||
|
||||
if nreset > 0 {
|
||||
log.Warnf("Reset %d empty centers", nreset)
|
||||
clustering.AssignToClass()
|
||||
log.Warnf("Reseted %d empty centers", nreset)
|
||||
}
|
||||
}
|
||||
|
||||
// SetCentersTo assigns new centers in the KmeansClustering
|
||||
// structure given a slice of indices.
|
||||
//
|
||||
// The indices parameter is a slice of integers that
|
||||
// corresponds to the new indices of the cluster centers in
|
||||
// the dataset. It panics if any index is out of bounds.
|
||||
// This method does not return any value.
|
||||
func (clustering *KmeansClustering) SetCentersTo(indices []int) {
|
||||
for _, v := range indices {
|
||||
if v < 0 || v >= clustering.N() {
|
||||
log.Fatalf("Invalid center index: %d", v)
|
||||
}
|
||||
}
|
||||
|
||||
clustering.icenters = indices
|
||||
K := len(indices)
|
||||
|
||||
for i := 0; i < K; i++ {
|
||||
clustering.centers[i] = (*clustering.data)[indices[i]]
|
||||
}
|
||||
|
||||
clustering.AssignToClass()
|
||||
|
||||
}
|
||||
|
||||
// ComputeCenters calculates the centers of the K-means clustering algorithm.
|
||||
//
|
||||
// This method call AssignToClass() after computing the centers to ensure coherence
|
||||
// of the clustering data structure.
|
||||
//
|
||||
// It takes no parameters.
|
||||
// It does not return any values.
|
||||
func (clustering *KmeansClustering) ComputeCenters() {
|
||||
@ -179,66 +321,54 @@ func (clustering *KmeansClustering) ComputeCenters() {
|
||||
centers := clustering.centers
|
||||
data := clustering.data
|
||||
classes := clustering.classes
|
||||
k := clustering.K()
|
||||
K := clustering.K()
|
||||
|
||||
// Goroutine code
|
||||
goroutine1 := func(centerIdx int) {
|
||||
// compute the location of center of class centerIdx
|
||||
// as the point in the data the closest to the
|
||||
// center of class centerIdx
|
||||
newCenter := func(centerIdx int) {
|
||||
defer wg.Done()
|
||||
for j, row := range *data {
|
||||
class := classes[j]
|
||||
if class == centerIdx {
|
||||
for l, val := range row {
|
||||
centers[centerIdx][l] += val
|
||||
}
|
||||
}
|
||||
|
||||
center := make([]float64, clustering.Dimension())
|
||||
|
||||
for j := range center {
|
||||
center[j] = 0
|
||||
}
|
||||
}
|
||||
|
||||
for i := 0; i < k; i++ {
|
||||
wg.Add(1)
|
||||
go goroutine1(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
for i := range centers {
|
||||
for j := range centers[i] {
|
||||
centers[i][j] /= float64(clustering.sizes[i])
|
||||
}
|
||||
}
|
||||
|
||||
goroutine2 := func(centerIdx int) {
|
||||
defer wg.Done()
|
||||
dkmin := math.MaxFloat64
|
||||
dki := -1
|
||||
center := centers[centerIdx]
|
||||
for j, row := range *data {
|
||||
if classes[j] == centerIdx {
|
||||
dist := squareDist(row, center)
|
||||
if dist < dkmin {
|
||||
dkmin = dist
|
||||
dki = j
|
||||
for l, val := range row {
|
||||
center[l] += val
|
||||
}
|
||||
}
|
||||
}
|
||||
clustering.icenters[centerIdx] = dki
|
||||
clustering.centers[centerIdx] = (*data)[dki]
|
||||
|
||||
for j := range centers[centerIdx] {
|
||||
center[j] /= float64(clustering.sizes[centerIdx])
|
||||
}
|
||||
|
||||
C := clustering.ClosestPoint(center)
|
||||
|
||||
centers[centerIdx] = (*data)[C]
|
||||
clustering.icenters[centerIdx] = C
|
||||
}
|
||||
|
||||
for i := 0; i < k; i++ {
|
||||
for i := 0; i < K; i++ {
|
||||
wg.Add(1)
|
||||
go goroutine2(i)
|
||||
go newCenter(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
clustering.AssignToClass()
|
||||
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Inertia() float64 {
|
||||
inertia := 0.0
|
||||
|
||||
for i := 0; i < clustering.N(); i++ {
|
||||
inertia += clustering.distmin[i]
|
||||
inertia += clustering.distmin[i] * clustering.distmin[i]
|
||||
}
|
||||
return inertia
|
||||
}
|
||||
@ -264,7 +394,6 @@ func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool {
|
||||
newI := clustering.Inertia()
|
||||
for i := 0; i < max_cycle && (prev-newI) > threshold; i++ {
|
||||
prev = newI
|
||||
clustering.AssignToClass()
|
||||
clustering.ComputeCenters()
|
||||
newI = clustering.Inertia()
|
||||
}
|
||||
|
@ -109,43 +109,26 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
||||
n_landmark := CLINCenter()
|
||||
|
||||
landmark_idx := obistats.SampleIntWithoutReplacement(n_landmark, library_size)
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
log.Infof("Library contains %d sequence", len(library))
|
||||
|
||||
var seqworld obiutils.Matrix[float64]
|
||||
|
||||
for loop := 0; loop < 2; loop++ {
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
log.Debugf("Selected indices : %v", landmark_idx)
|
||||
|
||||
seqworld = MapOnLandmarkSequences(library, landmark_idx)
|
||||
initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark)
|
||||
for i, seq_idx := range landmark_idx {
|
||||
initialCenters[i] = seqworld[seq_idx]
|
||||
}
|
||||
|
||||
// classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters)
|
||||
classifier := obistats.MakeKmeansClustering(&seqworld, n_landmark, obistats.DefaultRG())
|
||||
_, centers, inertia, converged := classifier.Run(1000, 0.001)
|
||||
intertia := classifier.Inertia()
|
||||
_, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters)
|
||||
converged := classifier.Run(1000, 0.001)
|
||||
inertia := classifier.Inertia()
|
||||
|
||||
dist_centers := 0.0
|
||||
for i := 0; i < n_landmark; i++ {
|
||||
center := (*centers)[i]
|
||||
icenter := initialCenters[i]
|
||||
for j := 0; j < n_landmark; j++ {
|
||||
diff := center[j] - icenter[j]
|
||||
dist_centers += diff * diff
|
||||
}
|
||||
}
|
||||
|
||||
landmark_idx = obistats.KmeansBestRepresentative(&seqworld, centers)
|
||||
log.Infof("Inertia: %f, Dist centers: %f, converged: %t", inertia, dist_centers, converged)
|
||||
log.Infof("Inertia: %f, converged: %t", inertia, converged)
|
||||
|
||||
landmark_idx = classifier.CentersIndices()
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
}
|
||||
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
|
||||
log.Debugf("Selected indices : %v", landmark_idx)
|
||||
seqworld = MapOnLandmarkSequences(library, landmark_idx)
|
||||
|
||||
@ -159,12 +142,14 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
||||
initialCenters[i] = seqworld[seq_idx]
|
||||
}
|
||||
|
||||
classes := obistats.AssignToClass(&seqworld, &initialCenters)
|
||||
// classes := obistats.AssignToClass(&seqworld, &initialCenters)
|
||||
|
||||
for i, seq := range library {
|
||||
ic, _ := obiutils.InterfaceToIntSlice(seqworld[i])
|
||||
seq.SetCoordinate(ic)
|
||||
seq.SetAttribute("landmark_class", classes[i])
|
||||
// seq.SetAttribute("landmark_class", classes[i])
|
||||
|
||||
// if the sequence is a landmark sequence
|
||||
if i, ok := seq_landmark[i]; ok {
|
||||
seq.SetAttribute("landmark_id", i)
|
||||
}
|
||||
|
@ -7,6 +7,7 @@ import (
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
@ -35,12 +36,8 @@ func GeomIndexSesquence(seqidx int,
|
||||
if reflocation == nil {
|
||||
log.Fatalf("Sequence %s does not have a coordinate", ref.Id())
|
||||
}
|
||||
d := 0.0
|
||||
for i, x := range location {
|
||||
diff := float64(x - reflocation[i])
|
||||
d += diff * diff
|
||||
}
|
||||
seq_dist[i] = d
|
||||
|
||||
seq_dist[i] = obistats.SquareDist(location, reflocation)
|
||||
}(i, ref)
|
||||
}
|
||||
|
||||
@ -51,18 +48,16 @@ func GeomIndexSesquence(seqidx int,
|
||||
lca := (*taxa)[seqidx]
|
||||
|
||||
index := make(map[int]string)
|
||||
index[0.0] = fmt.Sprintf(
|
||||
index[0] = fmt.Sprintf(
|
||||
"%d@%s@%s",
|
||||
lca.Taxid(),
|
||||
lca.ScientificName(),
|
||||
lca.Rank())
|
||||
|
||||
old_dist := 0.0
|
||||
for _, o := range order {
|
||||
new_lca, _ := lca.LCA((*taxa)[o])
|
||||
if new_lca.Taxid() != lca.Taxid() || seq_dist[o] != old_dist {
|
||||
if new_lca.Taxid() != lca.Taxid() {
|
||||
lca = new_lca
|
||||
old_dist = seq_dist[o]
|
||||
index[int(seq_dist[o])] = fmt.Sprintf(
|
||||
"%d@%s@%s",
|
||||
lca.Taxid(),
|
||||
|
@ -8,6 +8,7 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
)
|
||||
|
||||
@ -74,7 +75,7 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS
|
||||
coords := make([]int, len(*landmarks))
|
||||
|
||||
for i, l := range *landmarks {
|
||||
lcs, length := obialign.FastLCSEGFScore(sequence, l, -1, buffer)
|
||||
lcs, length := obialign.FastLCSScore(sequence, l, -1, buffer)
|
||||
coords[i] = length - lcs
|
||||
}
|
||||
|
||||
@ -98,9 +99,9 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS
|
||||
func FindGeomClosest(sequence *obiseq.BioSequence,
|
||||
landmarks *obiseq.BioSequenceSlice,
|
||||
references *obiseq.BioSequenceSlice,
|
||||
buffer *[]uint64) (*obiseq.BioSequence, int, float64, []int, *obiseq.BioSequenceSlice) {
|
||||
buffer *[]uint64) (*obiseq.BioSequence, float64, float64, []int, *obiseq.BioSequenceSlice) {
|
||||
|
||||
min_dist := math.MaxInt64
|
||||
min_dist := math.MaxFloat64
|
||||
min_idx := make([]int, 0)
|
||||
|
||||
query_location := MapOnLandmarkSequences(sequence, landmarks, buffer)
|
||||
@ -110,11 +111,8 @@ func FindGeomClosest(sequence *obiseq.BioSequence,
|
||||
if len(coord) == 0 {
|
||||
log.Panicf("Empty coordinate for reference sequence %s", l.Id())
|
||||
}
|
||||
dist := 0
|
||||
for j := 0; j < len(coord); j++ {
|
||||
diff := query_location[j] - coord[j]
|
||||
dist += diff * diff
|
||||
}
|
||||
|
||||
dist := obistats.SquareDist(coord, query_location)
|
||||
|
||||
if dist == min_dist {
|
||||
min_idx = append(min_idx, i)
|
||||
|
@ -6,6 +6,8 @@ import (
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"golang.org/x/exp/maps"
|
||||
"golang.org/x/exp/slices"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
@ -27,22 +29,20 @@ import (
|
||||
// - taxid: The taxid associated with the matched distance.
|
||||
// - rank: The rank associated with the matched distance.
|
||||
// - scientificName: The scientific name associated with the matched distance.
|
||||
func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
|
||||
keys := make([]int, 0, len(distanceIdx))
|
||||
for k := range distanceIdx {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Ints(keys)
|
||||
func MatchDistanceIndex(distance float64, distanceIdx map[int]string) (int, string, string) {
|
||||
idist := int(distance)
|
||||
keys := maps.Keys(distanceIdx)
|
||||
slices.Sort(keys)
|
||||
|
||||
i := sort.Search(len(keys), func(i int) bool {
|
||||
return distance <= keys[i]
|
||||
return idist <= keys[i]
|
||||
})
|
||||
|
||||
var taxid int
|
||||
var rank string
|
||||
var scientificName string
|
||||
|
||||
if i == len(keys) || distance > keys[len(keys)-1] {
|
||||
if i == len(keys) || idist > keys[len(keys)-1] {
|
||||
taxid = 1
|
||||
rank = "no rank"
|
||||
scientificName = "root"
|
||||
|
@ -15,6 +15,12 @@ func (r intRanker) Len() int { return len(r.x) }
|
||||
func (r intRanker) Less(i, j int) bool { return r.x[r.r[i]] < r.x[r.r[j]] }
|
||||
func (r intRanker) Swap(i, j int) { r.r[i], r.r[j] = r.r[j], r.r[i] }
|
||||
|
||||
// IntOrder sorts a slice of integers and returns a slice
|
||||
// of indices that represents the order of the sorted
|
||||
// elements.
|
||||
//
|
||||
// `data` is a slice of integers to be ordered.
|
||||
// Returns a slice of the ordered indices.
|
||||
func IntOrder(data []int) []int {
|
||||
if len(data) == 0 {
|
||||
return nil
|
||||
|
Reference in New Issue
Block a user