From f2f7b4574ed0123c42eccf7039dc4897cba4e509 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 17 Jan 2024 23:38:51 +0100 Subject: [PATCH] update the geometric obitag Former-commit-id: acd8fe1c8c1cf443098432d818397b0b5d02df33 --- .gitignore | 1 + pkg/obistats/kmeans.go | 10 ++++---- pkg/obitools/obirefidx/geomindexing.go | 35 +++++++++++++++++--------- pkg/obitools/obitag/obigeomtag.go | 8 +++--- pkg/obitools/obitag/obitag.go | 7 +++--- 5 files changed, 36 insertions(+), 25 deletions(-) diff --git a/.gitignore b/.gitignore index 635bddb..215e7f9 100644 --- a/.gitignore +++ b/.gitignore @@ -116,3 +116,4 @@ doc/book/wolf_data/Release-253/ncbitaxo/names.dmp doc/book/wolf_data/Release-253/ncbitaxo/nodes.dmp doc/book/wolf_data/Release-253/ncbitaxo/readme.txt doc/book/results/toto.tasta +sample/.DS_Store diff --git a/pkg/obistats/kmeans.go b/pkg/obistats/kmeans.go index 10d59e8..b4a8eeb 100644 --- a/pkg/obistats/kmeans.go +++ b/pkg/obistats/kmeans.go @@ -15,14 +15,14 @@ import ( // SquareDist calculates the squared Euclidean distance between // two vectors 'a' and 'b'. // -// 'a' and 'b' are slices of float64 values representing +// 'a' and 'b' are slices of float64 or int values representing // coordinate points in space. It is assumed that both slices // have the same length. // Returns the calculated squared distance as a float64. -func SquareDist[T float64 | int](a, b []T) float64 { - sum := 0.0 +func SquareDist[T float64 | int](a, b []T) T { + sum := T(0) for i, v := range a { - diff := float64(v - b[i]) + diff := v - b[i] sum += diff * diff } return sum @@ -35,7 +35,7 @@ func SquareDist[T float64 | int](a, b []T) float64 { // is paired with the corresponding element of `b`. // Returns the squared sum of the differences. func EuclideanDist[T float64 | int](a, b []T) float64 { - return math.Sqrt(SquareDist(a, b)) + return math.Sqrt(float64(SquareDist(a, b))) } // DefaultRG creates and returns a new instance of *rand.Rand. diff --git a/pkg/obitools/obirefidx/geomindexing.go b/pkg/obitools/obirefidx/geomindexing.go index 06e7dd0..2a4157c 100644 --- a/pkg/obitools/obirefidx/geomindexing.go +++ b/pkg/obitools/obirefidx/geomindexing.go @@ -6,6 +6,7 @@ import ( "sort" "sync" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" @@ -24,26 +25,36 @@ func GeomIndexSesquence(seqidx int, log.Fatalf("Sequence %s does not have a coordinate", sequence.Id()) } - seq_dist := make([]float64, len(references)) + seq_dist := make([]int, len(references)) var wg sync.WaitGroup - for i, ref := range references { - wg.Add(1) - go func(i int, ref *obiseq.BioSequence) { - defer wg.Done() - reflocation := ref.GetCoordinate() - if reflocation == nil { - log.Fatalf("Sequence %s does not have a coordinate", ref.Id()) - } + iseq_channel := make(chan int) - seq_dist[i] = obistats.SquareDist(location, reflocation) - }(i, ref) + for k := 0; k < obioptions.CLIParallelWorkers(); k++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := range iseq_channel { + ref := references[i] + reflocation := ref.GetCoordinate() + if reflocation == nil { + log.Fatalf("Sequence %s does not have a coordinate", ref.Id()) + } + + seq_dist[i] = obistats.SquareDist(location, reflocation) + } + }() } + for i := range references { + iseq_channel <- i + } + + close(iseq_channel) wg.Wait() - order := obiutils.Order(sort.Float64Slice(seq_dist)) + order := obiutils.Order(sort.IntSlice(seq_dist)) lca := (*taxa)[seqidx] diff --git a/pkg/obitools/obitag/obigeomtag.go b/pkg/obitools/obitag/obigeomtag.go index f75eec6..0eb2118 100644 --- a/pkg/obitools/obitag/obigeomtag.go +++ b/pkg/obitools/obitag/obigeomtag.go @@ -1,7 +1,7 @@ package obitag import ( - "log" + log "github.com/sirupsen/logrus" "math" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" @@ -99,9 +99,9 @@ func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioS func FindGeomClosest(sequence *obiseq.BioSequence, landmarks *obiseq.BioSequenceSlice, references *obiseq.BioSequenceSlice, - buffer *[]uint64) (*obiseq.BioSequence, float64, float64, []int, *obiseq.BioSequenceSlice) { + buffer *[]uint64) (*obiseq.BioSequence, int, float64, []int, *obiseq.BioSequenceSlice) { - min_dist := math.MaxFloat64 + min_dist := math.MaxInt min_idx := make([]int, 0) query_location := MapOnLandmarkSequences(sequence, landmarks, buffer) @@ -129,7 +129,7 @@ func FindGeomClosest(sequence *obiseq.BioSequence, for _, i := range min_idx { seq := (*references)[i] - lcs, length := obialign.FastLCSEGFScore(sequence, seq, -1, buffer) + lcs, length := obialign.FastLCSScore(sequence, seq, -1, buffer) ident := float64(lcs) / float64(length) if ident > best_id { best_id = ident diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 6cac5f3..0f6f5cd 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -29,20 +29,19 @@ import ( // - taxid: The taxid associated with the matched distance. // - rank: The rank associated with the matched distance. // - scientificName: The scientific name associated with the matched distance. -func MatchDistanceIndex(distance float64, distanceIdx map[int]string) (int, string, string) { - idist := int(distance) +func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) { keys := maps.Keys(distanceIdx) slices.Sort(keys) i := sort.Search(len(keys), func(i int) bool { - return idist <= keys[i] + return distance <= keys[i] }) var taxid int var rank string var scientificName string - if i == len(keys) || idist > keys[len(keys)-1] { + if i == len(keys) || distance > keys[len(keys)-1] { taxid = 1 rank = "no rank" scientificName = "root"