mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-13 18:50:26 +00:00
Implements the kmeans++ algo to select the landmarks in the geometric method
Former-commit-id: 732404a0dc6d7276e4e479dd2481aa4bd42d4ce5
This commit is contained in:
@@ -109,43 +109,26 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
||||
n_landmark := CLINCenter()
|
||||
|
||||
landmark_idx := obistats.SampleIntWithoutReplacement(n_landmark, library_size)
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
log.Infof("Library contains %d sequence", len(library))
|
||||
|
||||
var seqworld obiutils.Matrix[float64]
|
||||
|
||||
for loop := 0; loop < 2; loop++ {
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
log.Debugf("Selected indices : %v", landmark_idx)
|
||||
|
||||
seqworld = MapOnLandmarkSequences(library, landmark_idx)
|
||||
initialCenters := obiutils.Make2DArray[float64](n_landmark, n_landmark)
|
||||
for i, seq_idx := range landmark_idx {
|
||||
initialCenters[i] = seqworld[seq_idx]
|
||||
}
|
||||
|
||||
// classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters)
|
||||
classifier := obistats.MakeKmeansClustering(&seqworld, n_landmark, obistats.DefaultRG())
|
||||
_, centers, inertia, converged := classifier.Run(1000, 0.001)
|
||||
intertia := classifier.Inertia()
|
||||
_, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters)
|
||||
converged := classifier.Run(1000, 0.001)
|
||||
inertia := classifier.Inertia()
|
||||
|
||||
dist_centers := 0.0
|
||||
for i := 0; i < n_landmark; i++ {
|
||||
center := (*centers)[i]
|
||||
icenter := initialCenters[i]
|
||||
for j := 0; j < n_landmark; j++ {
|
||||
diff := center[j] - icenter[j]
|
||||
dist_centers += diff * diff
|
||||
}
|
||||
}
|
||||
|
||||
landmark_idx = obistats.KmeansBestRepresentative(&seqworld, centers)
|
||||
log.Infof("Inertia: %f, Dist centers: %f, converged: %t", inertia, dist_centers, converged)
|
||||
log.Infof("Inertia: %f, converged: %t", inertia, converged)
|
||||
|
||||
landmark_idx = classifier.CentersIndices()
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
}
|
||||
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
|
||||
log.Debugf("Selected indices : %v", landmark_idx)
|
||||
seqworld = MapOnLandmarkSequences(library, landmark_idx)
|
||||
|
||||
@@ -159,12 +142,14 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
||||
initialCenters[i] = seqworld[seq_idx]
|
||||
}
|
||||
|
||||
classes := obistats.AssignToClass(&seqworld, &initialCenters)
|
||||
// classes := obistats.AssignToClass(&seqworld, &initialCenters)
|
||||
|
||||
for i, seq := range library {
|
||||
ic, _ := obiutils.InterfaceToIntSlice(seqworld[i])
|
||||
seq.SetCoordinate(ic)
|
||||
seq.SetAttribute("landmark_class", classes[i])
|
||||
// seq.SetAttribute("landmark_class", classes[i])
|
||||
|
||||
// if the sequence is a landmark sequence
|
||||
if i, ok := seq_landmark[i]; ok {
|
||||
seq.SetAttribute("landmark_id", i)
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
@@ -35,12 +36,8 @@ func GeomIndexSesquence(seqidx int,
|
||||
if reflocation == nil {
|
||||
log.Fatalf("Sequence %s does not have a coordinate", ref.Id())
|
||||
}
|
||||
d := 0.0
|
||||
for i, x := range location {
|
||||
diff := float64(x - reflocation[i])
|
||||
d += diff * diff
|
||||
}
|
||||
seq_dist[i] = d
|
||||
|
||||
seq_dist[i] = obistats.SquareDist(location, reflocation)
|
||||
}(i, ref)
|
||||
}
|
||||
|
||||
@@ -51,18 +48,16 @@ func GeomIndexSesquence(seqidx int,
|
||||
lca := (*taxa)[seqidx]
|
||||
|
||||
index := make(map[int]string)
|
||||
index[0.0] = fmt.Sprintf(
|
||||
index[0] = fmt.Sprintf(
|
||||
"%d@%s@%s",
|
||||
lca.Taxid(),
|
||||
lca.ScientificName(),
|
||||
lca.Rank())
|
||||
|
||||
old_dist := 0.0
|
||||
for _, o := range order {
|
||||
new_lca, _ := lca.LCA((*taxa)[o])
|
||||
if new_lca.Taxid() != lca.Taxid() || seq_dist[o] != old_dist {
|
||||
if new_lca.Taxid() != lca.Taxid() {
|
||||
lca = new_lca
|
||||
old_dist = seq_dist[o]
|
||||
index[int(seq_dist[o])] = fmt.Sprintf(
|
||||
"%d@%s@%s",
|
||||
lca.Taxid(),
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
)
|
||||
|
||||
@@ -74,7 +75,7 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS
|
||||
coords := make([]int, len(*landmarks))
|
||||
|
||||
for i, l := range *landmarks {
|
||||
lcs, length := obialign.FastLCSEGFScore(sequence, l, -1, buffer)
|
||||
lcs, length := obialign.FastLCSScore(sequence, l, -1, buffer)
|
||||
coords[i] = length - lcs
|
||||
}
|
||||
|
||||
@@ -98,9 +99,9 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS
|
||||
func FindGeomClosest(sequence *obiseq.BioSequence,
|
||||
landmarks *obiseq.BioSequenceSlice,
|
||||
references *obiseq.BioSequenceSlice,
|
||||
buffer *[]uint64) (*obiseq.BioSequence, int, float64, []int, *obiseq.BioSequenceSlice) {
|
||||
buffer *[]uint64) (*obiseq.BioSequence, float64, float64, []int, *obiseq.BioSequenceSlice) {
|
||||
|
||||
min_dist := math.MaxInt64
|
||||
min_dist := math.MaxFloat64
|
||||
min_idx := make([]int, 0)
|
||||
|
||||
query_location := MapOnLandmarkSequences(sequence, landmarks, buffer)
|
||||
@@ -110,11 +111,8 @@ func FindGeomClosest(sequence *obiseq.BioSequence,
|
||||
if len(coord) == 0 {
|
||||
log.Panicf("Empty coordinate for reference sequence %s", l.Id())
|
||||
}
|
||||
dist := 0
|
||||
for j := 0; j < len(coord); j++ {
|
||||
diff := query_location[j] - coord[j]
|
||||
dist += diff * diff
|
||||
}
|
||||
|
||||
dist := obistats.SquareDist(coord, query_location)
|
||||
|
||||
if dist == min_dist {
|
||||
min_idx = append(min_idx, i)
|
||||
|
||||
@@ -6,6 +6,8 @@ import (
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"golang.org/x/exp/maps"
|
||||
"golang.org/x/exp/slices"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
@@ -27,22 +29,20 @@ import (
|
||||
// - taxid: The taxid associated with the matched distance.
|
||||
// - rank: The rank associated with the matched distance.
|
||||
// - scientificName: The scientific name associated with the matched distance.
|
||||
func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
|
||||
keys := make([]int, 0, len(distanceIdx))
|
||||
for k := range distanceIdx {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Ints(keys)
|
||||
func MatchDistanceIndex(distance float64, distanceIdx map[int]string) (int, string, string) {
|
||||
idist := int(distance)
|
||||
keys := maps.Keys(distanceIdx)
|
||||
slices.Sort(keys)
|
||||
|
||||
i := sort.Search(len(keys), func(i int) bool {
|
||||
return distance <= keys[i]
|
||||
return idist <= keys[i]
|
||||
})
|
||||
|
||||
var taxid int
|
||||
var rank string
|
||||
var scientificName string
|
||||
|
||||
if i == len(keys) || distance > keys[len(keys)-1] {
|
||||
if i == len(keys) || idist > keys[len(keys)-1] {
|
||||
taxid = 1
|
||||
rank = "no rank"
|
||||
scientificName = "root"
|
||||
|
||||
Reference in New Issue
Block a user