Implements the kmeans++ algo to select the landmarks in the geometric method

Former-commit-id: 732404a0dc6d7276e4e479dd2481aa4bd42d4ce5
This commit is contained in:
2023-12-11 16:07:03 +01:00
parent 37c3e16d5d
commit 2caaa62485
8 changed files with 259 additions and 140 deletions

View File

@@ -109,43 +109,26 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
n_landmark := CLINCenter()
landmark_idx := obistats.SampleIntWithoutReplacement(n_landmark, library_size)
sort.IntSlice(landmark_idx).Sort()
log.Infof("Library contains %d sequence", len(library))
var seqworld obiutils.Matrix[float64]
for loop := 0; loop < 2; loop++ {
sort.IntSlice(landmark_idx).Sort()
log.Debugf("Selected indices : %v", landmark_idx)
seqworld = MapOnLandmarkSequences(library, landmark_idx)
initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark)
for i, seq_idx := range landmark_idx {
initialCenters[i] = seqworld[seq_idx]
}
// classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters)
classifier := obistats.MakeKmeansClustering(&seqworld, n_landmark, obistats.DefaultRG())
_, centers, inertia, converged := classifier.Run(1000, 0.001)
intertia := classifier.Inertia()
_, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters)
converged := classifier.Run(1000, 0.001)
inertia := classifier.Inertia()
dist_centers := 0.0
for i := 0; i < n_landmark; i++ {
center := (*centers)[i]
icenter := initialCenters[i]
for j := 0; j < n_landmark; j++ {
diff := center[j] - icenter[j]
dist_centers += diff * diff
}
}
landmark_idx = obistats.KmeansBestRepresentative(&seqworld, centers)
log.Infof("Inertia: %f, Dist centers: %f, converged: %t", inertia, dist_centers, converged)
log.Infof("Inertia: %f, converged: %t", inertia, converged)
landmark_idx = classifier.CentersIndices()
sort.IntSlice(landmark_idx).Sort()
}
sort.IntSlice(landmark_idx).Sort()
log.Debugf("Selected indices : %v", landmark_idx)
seqworld = MapOnLandmarkSequences(library, landmark_idx)
@@ -159,12 +142,14 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
initialCenters[i] = seqworld[seq_idx]
}
classes := obistats.AssignToClass(&seqworld, &initialCenters)
// classes := obistats.AssignToClass(&seqworld, &initialCenters)
for i, seq := range library {
ic, _ := obiutils.InterfaceToIntSlice(seqworld[i])
seq.SetCoordinate(ic)
seq.SetAttribute("landmark_class", classes[i])
// seq.SetAttribute("landmark_class", classes[i])
// if the sequence is a landmark sequence
if i, ok := seq_landmark[i]; ok {
seq.SetAttribute("landmark_id", i)
}

View File

@@ -7,6 +7,7 @@ import (
"sync"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
@@ -35,12 +36,8 @@ func GeomIndexSesquence(seqidx int,
if reflocation == nil {
log.Fatalf("Sequence %s does not have a coordinate", ref.Id())
}
d := 0.0
for i, x := range location {
diff := float64(x - reflocation[i])
d += diff * diff
}
seq_dist[i] = d
seq_dist[i] = obistats.SquareDist(location, reflocation)
}(i, ref)
}
@@ -51,18 +48,16 @@ func GeomIndexSesquence(seqidx int,
lca := (*taxa)[seqidx]
index := make(map[int]string)
index[0.0] = fmt.Sprintf(
index[0] = fmt.Sprintf(
"%d@%s@%s",
lca.Taxid(),
lca.ScientificName(),
lca.Rank())
old_dist := 0.0
for _, o := range order {
new_lca, _ := lca.LCA((*taxa)[o])
if new_lca.Taxid() != lca.Taxid() || seq_dist[o] != old_dist {
if new_lca.Taxid() != lca.Taxid() {
lca = new_lca
old_dist = seq_dist[o]
index[int(seq_dist[o])] = fmt.Sprintf(
"%d@%s@%s",
lca.Taxid(),

View File

@@ -8,6 +8,7 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
)
@@ -74,7 +75,7 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS
coords := make([]int, len(*landmarks))
for i, l := range *landmarks {
lcs, length := obialign.FastLCSEGFScore(sequence, l, -1, buffer)
lcs, length := obialign.FastLCSScore(sequence, l, -1, buffer)
coords[i] = length - lcs
}
@@ -98,9 +99,9 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS
func FindGeomClosest(sequence *obiseq.BioSequence,
landmarks *obiseq.BioSequenceSlice,
references *obiseq.BioSequenceSlice,
buffer *[]uint64) (*obiseq.BioSequence, int, float64, []int, *obiseq.BioSequenceSlice) {
buffer *[]uint64) (*obiseq.BioSequence, float64, float64, []int, *obiseq.BioSequenceSlice) {
min_dist := math.MaxInt64
min_dist := math.MaxFloat64
min_idx := make([]int, 0)
query_location := MapOnLandmarkSequences(sequence, landmarks, buffer)
@@ -110,11 +111,8 @@ func FindGeomClosest(sequence *obiseq.BioSequence,
if len(coord) == 0 {
log.Panicf("Empty coordinate for reference sequence %s", l.Id())
}
dist := 0
for j := 0; j < len(coord); j++ {
diff := query_location[j] - coord[j]
dist += diff * diff
}
dist := obistats.SquareDist(coord, query_location)
if dist == min_dist {
min_idx = append(min_idx, i)

View File

@@ -6,6 +6,8 @@ import (
"strings"
log "github.com/sirupsen/logrus"
"golang.org/x/exp/maps"
"golang.org/x/exp/slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
@@ -27,22 +29,20 @@ import (
// - taxid: The taxid associated with the matched distance.
// - rank: The rank associated with the matched distance.
// - scientificName: The scientific name associated with the matched distance.
func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
keys := make([]int, 0, len(distanceIdx))
for k := range distanceIdx {
keys = append(keys, k)
}
sort.Ints(keys)
func MatchDistanceIndex(distance float64, distanceIdx map[int]string) (int, string, string) {
idist := int(distance)
keys := maps.Keys(distanceIdx)
slices.Sort(keys)
i := sort.Search(len(keys), func(i int) bool {
return distance <= keys[i]
return idist <= keys[i]
})
var taxid int
var rank string
var scientificName string
if i == len(keys) || distance > keys[len(keys)-1] {
if i == len(keys) || idist > keys[len(keys)-1] {
taxid = 1
rank = "no rank"
scientificName = "root"