mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
First version of obicleandb...
Former-commit-id: e60b61d015abbf029a555b51de99b4252c50ab59
This commit is contained in:
@ -7,7 +7,7 @@ import (
|
|||||||
// TODO: The version number is extracted from git. This induces that the version
|
// TODO: The version number is extracted from git. This induces that the version
|
||||||
// corresponds to the last commit, and not the one when the file will be
|
// corresponds to the last commit, and not the one when the file will be
|
||||||
// commited
|
// commited
|
||||||
var _Commit = "86d208f"
|
var _Commit = "ea14e1a"
|
||||||
var _Version = "Release 4.2.0"
|
var _Version = "Release 4.2.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
@ -172,6 +172,26 @@ func (s *BioSequence) GetIntAttribute(key string) (int, bool) {
|
|||||||
return val, ok
|
return val, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *BioSequence) GetFloatAttribute(key string) (float64, bool) {
|
||||||
|
var val float64
|
||||||
|
var err error
|
||||||
|
|
||||||
|
v, ok := s.GetAttribute(key)
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
val, ok = v.(float64)
|
||||||
|
if !ok {
|
||||||
|
val, err = obiutils.InterfaceToFloat64(v)
|
||||||
|
ok = err == nil
|
||||||
|
if ok {
|
||||||
|
s.SetAttribute(key, val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return val, ok
|
||||||
|
}
|
||||||
|
|
||||||
// DeleteAttribute deletes the attribute with the given key from the BioSequence.
|
// DeleteAttribute deletes the attribute with the given key from the BioSequence.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
|
38
pkg/obistats/stats.go
Normal file
38
pkg/obistats/stats.go
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
package obistats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"slices"
|
||||||
|
|
||||||
|
"golang.org/x/exp/constraints"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Number interface {
|
||||||
|
constraints.Float | constraints.Integer
|
||||||
|
}
|
||||||
|
|
||||||
|
func Median[T Number](data []T) float64 {
|
||||||
|
dataCopy := make([]T, len(data))
|
||||||
|
copy(dataCopy, data)
|
||||||
|
|
||||||
|
slices.Sort(dataCopy)
|
||||||
|
|
||||||
|
var median float64
|
||||||
|
l := len(dataCopy)
|
||||||
|
if l == 0 {
|
||||||
|
return 0
|
||||||
|
} else if l%2 == 0 {
|
||||||
|
median = float64((dataCopy[l/2-1] + dataCopy[l/2]) / 2.0)
|
||||||
|
} else {
|
||||||
|
median = float64(dataCopy[l/2])
|
||||||
|
}
|
||||||
|
|
||||||
|
return median
|
||||||
|
}
|
||||||
|
|
||||||
|
func Mean[T Number](data []T) float64 {
|
||||||
|
var sum float64
|
||||||
|
for _, v := range data {
|
||||||
|
sum += float64(v)
|
||||||
|
}
|
||||||
|
return sum / float64(len(data))
|
||||||
|
}
|
@ -3,13 +3,105 @@ package obicleandb
|
|||||||
import (
|
import (
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obichunk"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obichunk"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obigrep"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obigrep"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func SequenceTrust(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||||
|
sequence.SetAttribute("obicleandb_trusted", 1.0-1.0/float64(sequence.Count()+1))
|
||||||
|
sequence.SetAttribute("obicleandb_trusted_on", float64(sequence.Count()))
|
||||||
|
return obiseq.BioSequenceSlice{sequence}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func diagCoord(x, y, n int) int {
|
||||||
|
if x > y {
|
||||||
|
x, y = y, x
|
||||||
|
}
|
||||||
|
|
||||||
|
if x == y {
|
||||||
|
log.Panicf("diagCoord: (%d == %d)", x, y)
|
||||||
|
}
|
||||||
|
|
||||||
|
sn := n * (n - 1) / 2
|
||||||
|
sx := (n - x) * (n - (1 + x)) / 2
|
||||||
|
s := sn - sx
|
||||||
|
|
||||||
|
return s + y - x - 1
|
||||||
|
}
|
||||||
|
func SequenceTrustSlice(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
|
||||||
|
n := len(sequences)
|
||||||
|
score := make([]float64, n*(n-1)/2)
|
||||||
|
matrix := make([]uint64, sequences[0].Len()*sequences[0].Len())
|
||||||
|
|
||||||
|
for i, sa := range sequences {
|
||||||
|
for j, sb := range sequences[i+1:] {
|
||||||
|
lca, lali := obialign.FastLCSScore(sa, sb, -1, &matrix)
|
||||||
|
score[diagCoord(i, i+1+j, n)] = float64(lca) / float64(lali)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, sa := range sequences {
|
||||||
|
ss := make([]float64, 0, n-1)
|
||||||
|
for j, _ := range sequences {
|
||||||
|
if i == j {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
s := score[diagCoord(i, j, n)]
|
||||||
|
if s == 0.0 {
|
||||||
|
log.Panicf("score[%d, %d] == 0.0", i, j)
|
||||||
|
}
|
||||||
|
ss = append(ss, score[diagCoord(i, j, n)])
|
||||||
|
|
||||||
|
}
|
||||||
|
sa.SetAttribute("obicleandb_dist", ss)
|
||||||
|
}
|
||||||
|
|
||||||
|
scoremed := obistats.Median(score)
|
||||||
|
scorethr := 1 - 2*(1-scoremed)
|
||||||
|
mednorm := (scoremed - scorethr) / 2.0
|
||||||
|
|
||||||
|
for i, s := range score {
|
||||||
|
switch {
|
||||||
|
case s < scorethr:
|
||||||
|
score[i] = -1.0
|
||||||
|
case s < scoremed:
|
||||||
|
score[i] = (s-scorethr)/mednorm - 1.0
|
||||||
|
default:
|
||||||
|
score[i] = 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tylos
|
||||||
|
for i, sa := range sequences {
|
||||||
|
ngroup := float64(sa.Count())
|
||||||
|
ss := make([]float64, 0, n-1)
|
||||||
|
sc := sa.Count()
|
||||||
|
for j, sb := range sequences {
|
||||||
|
if i == j {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
ss = append(ss, score[diagCoord(i, j, n)])
|
||||||
|
sc += sb.Count()
|
||||||
|
nt, _ := sb.GetFloatAttribute("obicleandb_trusted_on")
|
||||||
|
ngroup += score[diagCoord(i, j, n)] * nt
|
||||||
|
}
|
||||||
|
ngroup = max(0, ngroup)
|
||||||
|
sa.SetAttribute("obicleandb_trusted", 1.0-1.0/float64(ngroup+1))
|
||||||
|
sa.SetAttribute("obicleandb_on", ngroup)
|
||||||
|
sa.SetAttribute("obicleandb_median", scoremed)
|
||||||
|
sa.SetAttribute("obicleandb_ss", ss)
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
var rankPredicate obiseq.SequencePredicate
|
var rankPredicate obiseq.SequencePredicate
|
||||||
|
|
||||||
@ -58,9 +150,24 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
).MakeIWorker(taxonomy.MakeSetFamilyWorker(),
|
).MakeIWorker(taxonomy.MakeSetFamilyWorker(),
|
||||||
false,
|
false,
|
||||||
obioptions.CLIParallelWorkers(),
|
obioptions.CLIParallelWorkers(),
|
||||||
|
).MakeIWorker(SequenceTrust,
|
||||||
|
false,
|
||||||
|
obioptions.CLIParallelWorkers(),
|
||||||
)
|
)
|
||||||
|
|
||||||
// annotated.MakeIConditionalWorker(obiseq.IsMoreAbundantOrEqualTo(3),1000)
|
genera_iterator, err := obichunk.ISequenceChunk(
|
||||||
|
annotated,
|
||||||
|
obiseq.AnnotationClassifier("genus_taxid", "NA"),
|
||||||
|
)
|
||||||
|
|
||||||
return annotated
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
trusted := genera_iterator.MakeISliceWorker(
|
||||||
|
SequenceTrustSlice,
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
|
||||||
|
return trusted
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user