From fd663357b50d2c4a9da7083d97aac3dde415888f Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 1 Jul 2024 17:12:42 +0200 Subject: [PATCH] First version of obicleandb... Former-commit-id: e60b61d015abbf029a555b51de99b4252c50ab59 --- pkg/obioptions/version.go | 2 +- pkg/obiseq/attributes.go | 20 +++++ pkg/obistats/stats.go | 38 +++++++++ pkg/obitools/obicleandb/obicleandb.go | 111 +++++++++++++++++++++++++- 4 files changed, 168 insertions(+), 3 deletions(-) create mode 100644 pkg/obistats/stats.go diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index b8551cd..2b6fa2a 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "86d208f" +var _Commit = "ea14e1a" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go index 254437d..ba36256 100644 --- a/pkg/obiseq/attributes.go +++ b/pkg/obiseq/attributes.go @@ -172,6 +172,26 @@ func (s *BioSequence) GetIntAttribute(key string) (int, bool) { return val, ok } +func (s *BioSequence) GetFloatAttribute(key string) (float64, bool) { + var val float64 + var err error + + v, ok := s.GetAttribute(key) + + if ok { + val, ok = v.(float64) + if !ok { + val, err = obiutils.InterfaceToFloat64(v) + ok = err == nil + if ok { + s.SetAttribute(key, val) + } + } + } + + return val, ok +} + // DeleteAttribute deletes the attribute with the given key from the BioSequence. // // Parameters: diff --git a/pkg/obistats/stats.go b/pkg/obistats/stats.go new file mode 100644 index 0000000..23cdaa2 --- /dev/null +++ b/pkg/obistats/stats.go @@ -0,0 +1,38 @@ +package obistats + +import ( + "slices" + + "golang.org/x/exp/constraints" +) + +type Number interface { + constraints.Float | constraints.Integer +} + +func Median[T Number](data []T) float64 { + dataCopy := make([]T, len(data)) + copy(dataCopy, data) + + slices.Sort(dataCopy) + + var median float64 + l := len(dataCopy) + if l == 0 { + return 0 + } else if l%2 == 0 { + median = float64((dataCopy[l/2-1] + dataCopy[l/2]) / 2.0) + } else { + median = float64(dataCopy[l/2]) + } + + return median +} + +func Mean[T Number](data []T) float64 { + var sum float64 + for _, v := range data { + sum += float64(v) + } + return sum / float64(len(data)) +} diff --git a/pkg/obitools/obicleandb/obicleandb.go b/pkg/obitools/obicleandb/obicleandb.go index 18234d6..a3cb7fe 100644 --- a/pkg/obitools/obicleandb/obicleandb.go +++ b/pkg/obitools/obicleandb/obicleandb.go @@ -3,13 +3,105 @@ package obicleandb import ( log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obichunk" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obigrep" ) +func SequenceTrust(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + sequence.SetAttribute("obicleandb_trusted", 1.0-1.0/float64(sequence.Count()+1)) + sequence.SetAttribute("obicleandb_trusted_on", float64(sequence.Count())) + return obiseq.BioSequenceSlice{sequence}, nil +} + +func diagCoord(x, y, n int) int { + if x > y { + x, y = y, x + } + + if x == y { + log.Panicf("diagCoord: (%d == %d)", x, y) + } + + sn := n * (n - 1) / 2 + sx := (n - x) * (n - (1 + x)) / 2 + s := sn - sx + + return s + y - x - 1 +} +func SequenceTrustSlice(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) { + n := len(sequences) + score := make([]float64, n*(n-1)/2) + matrix := make([]uint64, sequences[0].Len()*sequences[0].Len()) + + for i, sa := range sequences { + for j, sb := range sequences[i+1:] { + lca, lali := obialign.FastLCSScore(sa, sb, -1, &matrix) + score[diagCoord(i, i+1+j, n)] = float64(lca) / float64(lali) + } + } + + for i, sa := range sequences { + ss := make([]float64, 0, n-1) + for j, _ := range sequences { + if i == j { + continue + } + + s := score[diagCoord(i, j, n)] + if s == 0.0 { + log.Panicf("score[%d, %d] == 0.0", i, j) + } + ss = append(ss, score[diagCoord(i, j, n)]) + + } + sa.SetAttribute("obicleandb_dist", ss) + } + + scoremed := obistats.Median(score) + scorethr := 1 - 2*(1-scoremed) + mednorm := (scoremed - scorethr) / 2.0 + + for i, s := range score { + switch { + case s < scorethr: + score[i] = -1.0 + case s < scoremed: + score[i] = (s-scorethr)/mednorm - 1.0 + default: + score[i] = 1.0 + } + } + + // Tylos + for i, sa := range sequences { + ngroup := float64(sa.Count()) + ss := make([]float64, 0, n-1) + sc := sa.Count() + for j, sb := range sequences { + if i == j { + continue + } + + ss = append(ss, score[diagCoord(i, j, n)]) + sc += sb.Count() + nt, _ := sb.GetFloatAttribute("obicleandb_trusted_on") + ngroup += score[diagCoord(i, j, n)] * nt + } + ngroup = max(0, ngroup) + sa.SetAttribute("obicleandb_trusted", 1.0-1.0/float64(ngroup+1)) + sa.SetAttribute("obicleandb_on", ngroup) + sa.SetAttribute("obicleandb_median", scoremed) + sa.SetAttribute("obicleandb_ss", ss) + } + + return sequences, nil +} + func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence { var rankPredicate obiseq.SequencePredicate @@ -58,9 +150,24 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence { ).MakeIWorker(taxonomy.MakeSetFamilyWorker(), false, obioptions.CLIParallelWorkers(), + ).MakeIWorker(SequenceTrust, + false, + obioptions.CLIParallelWorkers(), ) - // annotated.MakeIConditionalWorker(obiseq.IsMoreAbundantOrEqualTo(3),1000) + genera_iterator, err := obichunk.ISequenceChunk( + annotated, + obiseq.AnnotationClassifier("genus_taxid", "NA"), + ) - return annotated + if err != nil { + log.Fatal(err) + } + + trusted := genera_iterator.MakeISliceWorker( + SequenceTrustSlice, + false, + ) + + return trusted }