mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
first version of obidemerge, obijoin and a new filter for obicleandb, but still to be finished
Former-commit-id: 8a1ed26e5548c30db75644c294d478ec4d753f19
@ -20,6 +20,24 @@

### New features

- Adds a new `obijoin` utility to join information contained in a sequence
  file with that contained in another sequence or CSV file. The command allows
  you to specify the names of the keys in the main sequence file and in the
  secondary data file that will be used to perform the join operation.

- Adds a new tool `obidemerge` to demerge a `merge_xxx` slot by recreating the
  multiple identical sequences, each with the slot `xxx` restored to its
  initial value and the sequence count set to the number of occurrences
  referred to in the `merge_xxx` slot. During the operation, the `merge_xxx`
  slot is removed.

- Adds CSV as one of the input formats for every obitools command. To encode
  sequences, the CSV file must include a column named `sequence` and another
  column named `id`. An extra column named `qualities` can be added to specify
  the quality scores of the sequence, following the same ASCII encoding as the
  FASTQ format. All the other columns are considered annotations and are
  interpreted as JSON objects, potentially encoding atomic values. If a
  column value cannot be decoded as JSON, it is treated as a string. A sketch
  of these decoding rules follows this list.

- A new option **--version** has been added to every obitools command. It
  prints the version of the command.
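The CSV annotation rule above (JSON first, string fallback) can be sketched outside obitools. This is a hypothetical illustration, not the obitools parser; the column names beyond `id`, `sequence` and `qualities` are invented for the example.

    package main

    import (
        "encoding/csv"
        "encoding/json"
        "fmt"
        "strings"
    )

    // decodeAnnotation applies the changelog rule for extra CSV columns:
    // try JSON first, fall back to a plain string on failure.
    func decodeAnnotation(cell string) interface{} {
        var v interface{}
        if err := json.Unmarshal([]byte(cell), &v); err != nil {
            return cell
        }
        return v
    }

    func main() {
        // A minimal CSV-encoded sequence file: `id` and `sequence` are
        // mandatory; `count` and `taxon` are annotation columns.
        data := "id,sequence,count,taxon\ns1,acgtacgt,3,\"{\"\"name\"\":\"\"Homo\"\"}\"\n"
        rows, _ := csv.NewReader(strings.NewReader(data)).ReadAll()
        header, row := rows[0], rows[1]

        annotations := map[string]interface{}{}
        for i, col := range header {
            if col == "id" || col == "sequence" || col == "qualities" {
                continue // these columns define the sequence record itself
            }
            annotations[col] = decodeAnnotation(row[i])
        }
        fmt.Println(annotations) // map[count:3 taxon:map[name:Homo]]
    }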
go.mod (+1 line)
@ -32,6 +32,7 @@ require (
 	github.com/klauspost/cpuid/v2 v2.0.9 // indirect
 	github.com/kr/pretty v0.3.0 // indirect
 	github.com/kr/text v0.2.0 // indirect
+	github.com/montanaflynn/stats v0.7.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rogpeppe/go-internal v1.6.1 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
go.sum (+2 lines)
@ -58,6 +58,8 @@ github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZ
 github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
+github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE=
+github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@ -7,7 +7,7 @@ import (
 // TODO: The version number is extracted from git. This implies that the version
 // corresponds to the last commit, and not the one when the file will be
 // committed
-var _Commit = "2bc540d"
+var _Commit = "a365bb6"
 var _Version = "Release 4.2.0"
 
 // Version returns the version of the obitools package.
@ -49,6 +49,8 @@ func MakeStatsOnDescription(descriptor string) StatsOnDescription {
 	}
 }
 
+var _merge_prefix = "merged_"
+
 // StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute.
 //
 // Parameters:
@ -57,7 +59,7 @@ func MakeStatsOnDescription(descriptor string) StatsOnDescription {
 // Return type:
 //   - string
 func StatsOnSlotName(key string) string {
-	return "merged_" + key
+	return _merge_prefix + key
 }
 
 // HasStatsOn tests if the sequence has already a slot summarizing statistics of occurrence for a given attribute.
@ -115,12 +117,12 @@ func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOn
 			}
 		}
 	default:
-		stats = make(StatsOnValues, 100)
+		stats = make(StatsOnValues, 10)
 		annotations[mkey] = stats
 		newstat = true
 	}
 } else {
-	stats = make(StatsOnValues, 100)
+	stats = make(StatsOnValues, 10)
 	annotations[mkey] = stats
 	newstat = true
 }
@ -165,7 +167,7 @@ func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSeq
 		log.Fatalf("Trying to make stats on a float value (%v : %T)", value, value)
 	}
 	default:
-		log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v : %T)", value, value)
+		log.Fatalf("Trying to make stats not on a string, integer or boolean value (%v : %T)", value, value)
 	}
 	retval = true
 }
@ -234,7 +236,7 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
 	if tomerge.HasAnnotation() {
 		ma := tomerge.Annotations()
 		for k, va := range annotations {
-			if !strings.HasPrefix(k, "merged_") {
+			if !strings.HasPrefix(k, _merge_prefix) {
 				vm, ok := ma[k]
 				if ok {
 					switch vm := vm.(type) {
@ -255,7 +257,7 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
 		}
 	} else {
 		for k := range annotations {
-			if !strings.HasPrefix(k, "merged_") {
+			if !strings.HasPrefix(k, _merge_prefix) {
 				delete(annotations, k)
 			}
 		}
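For context, the changelog's obidemerge rule (the inverse of the Merge operation patched above) can be sketched as follows. This is a conceptual illustration with invented names, not the obidemerge source: each key of a merged_xxx map becomes one record whose xxx slot holds the original value and whose count is the stored tally, and the merged_xxx slot itself disappears (here xxx = sample).

    package main

    import "fmt"

    // demerge expands one merged record back into per-value records,
    // following the rule described in the changelog (hypothetical sketch).
    func demerge(mergedSample map[string]int) []map[string]interface{} {
        records := make([]map[string]interface{}, 0, len(mergedSample))
        for value, count := range mergedSample {
            records = append(records, map[string]interface{}{
                "sample": value, // the recreated xxx slot
                "count":  count, // occurrences recorded in merged_sample
            })
        }
        return records // the merged_sample slot is not carried over
    }

    func main() {
        fmt.Println(demerge(map[string]int{"A": 3, "B": 1}))
        // Two records: {sample:A count:3} and {sample:B count:1}
    }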
pkg/obistats/algo.go (new file, +113 lines)
@ -0,0 +1,113 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

// Miscellaneous helper algorithms

import (
    "fmt"
)

func maxint(a, b int) int {
    if a > b {
        return a
    }
    return b
}

func minint(a, b int) int {
    if a < b {
        return a
    }
    return b
}

func sumint(xs []int) int {
    sum := 0
    for _, x := range xs {
        sum += x
    }
    return sum
}

// bisect returns an x in [low, high] such that |f(x)| <= tolerance
// using the bisection method.
//
// f(low) and f(high) must have opposite signs.
//
// If f does not have a root in this interval (e.g., it is
// discontiguous), this returns the X of the apparent discontinuity
// and false.
func bisect(f func(float64) float64, low, high, tolerance float64) (float64, bool) {
    flow, fhigh := f(low), f(high)
    if -tolerance <= flow && flow <= tolerance {
        return low, true
    }
    if -tolerance <= fhigh && fhigh <= tolerance {
        return high, true
    }
    if mathSign(flow) == mathSign(fhigh) {
        panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%g f(%g)=%g", low, flow, high, fhigh))
    }
    for {
        mid := (high + low) / 2
        fmid := f(mid)
        if -tolerance <= fmid && fmid <= tolerance {
            return mid, true
        }
        if mid == high || mid == low {
            return mid, false
        }
        if mathSign(fmid) == mathSign(flow) {
            low = mid
            flow = fmid
        } else {
            high = mid
            fhigh = fmid
        }
    }
}

// bisectBool implements the bisection method on a boolean function.
// It returns x1, x2 ∈ [low, high], x1 < x2 such that f(x1) != f(x2)
// and x2 - x1 <= xtol.
//
// If f(low) == f(high), it panics.
func bisectBool(f func(float64) bool, low, high, xtol float64) (x1, x2 float64) {
    flow, fhigh := f(low), f(high)
    if flow == fhigh {
        panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%v f(%g)=%v", low, flow, high, fhigh))
    }
    for {
        if high-low <= xtol {
            return low, high
        }
        mid := (high + low) / 2
        if mid == high || mid == low {
            return low, high
        }
        fmid := f(mid)
        if fmid == flow {
            low = mid
            flow = fmid
        } else {
            high = mid
            fhigh = fmid
        }
    }
}

// series returns the sum of the series f(0), f(1), ...
//
// This implementation is fast, but subject to round-off error.
func series(f func(float64) float64) float64 {
    y, yp := 0.0, 1.0
    for n := 0.0; y != yp; n++ {
        yp = y
        y += f(n)
    }
    return y
}
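A hypothetical in-package usage sketch (not part of the commit) of bisect on a bracketed root: f(0) = -2 and f(2) = 2 have opposite signs, so the root of x² - 2 is found.

    package obistats

    import "fmt"

    // exampleBisect illustrates bisect; it is not part of the commit.
    func exampleBisect() {
        f := func(x float64) float64 { return x*x - 2 }
        root, ok := bisect(f, 0, 2, 1e-9)
        fmt.Println(root, ok) // ≈ 1.4142135…, true
    }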
pkg/obistats/beta.go (new file, +93 lines)
@ -0,0 +1,93 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import "math"

func lgamma(x float64) float64 {
    y, _ := math.Lgamma(x)
    return y
}

// mathBeta returns the value of the complete beta function B(a, b).
func mathBeta(a, b float64) float64 {
    // B(x,y) = Γ(x)Γ(y) / Γ(x+y)
    return math.Exp(lgamma(a) + lgamma(b) - lgamma(a+b))
}

// mathBetaInc returns the value of the regularized incomplete beta
// function Iₓ(a, b).
//
// This is not to be confused with the "incomplete beta function",
// which can be computed as BetaInc(x, a, b)*Beta(a, b).
//
// If x < 0 or x > 1, returns NaN.
func mathBetaInc(x, a, b float64) float64 {
    // Based on Numerical Recipes in C, section 6.4. This uses the
    // continued fraction definition of I:
    //
    //  (xᵃ*(1-x)ᵇ)/(a*B(a,b)) * (1/(1+(d₁/(1+(d₂/(1+...))))))
    //
    // where B(a,b) is the beta function and
    //
    //  d_{2m+1} = -(a+m)(a+b+m)x/((a+2m)(a+2m+1))
    //  d_{2m}   = m(b-m)x/((a+2m-1)(a+2m))
    if x < 0 || x > 1 {
        return math.NaN()
    }
    bt := 0.0
    if 0 < x && x < 1 {
        // Compute the coefficient before the continued
        // fraction.
        bt = math.Exp(lgamma(a+b) - lgamma(a) - lgamma(b) +
            a*math.Log(x) + b*math.Log(1-x))
    }
    if x < (a+1)/(a+b+2) {
        // Compute continued fraction directly.
        return bt * betacf(x, a, b) / a
    }
    // Compute continued fraction after symmetry transform.
    return 1 - bt*betacf(1-x, b, a)/b
}

// betacf is the continued fraction component of the regularized
// incomplete beta function Iₓ(a, b).
func betacf(x, a, b float64) float64 {
    const maxIterations = 200
    const epsilon = 3e-14

    raiseZero := func(z float64) float64 {
        if math.Abs(z) < math.SmallestNonzeroFloat64 {
            return math.SmallestNonzeroFloat64
        }
        return z
    }

    c := 1.0
    d := 1 / raiseZero(1-(a+b)*x/(a+1))
    h := d
    for m := 1; m <= maxIterations; m++ {
        mf := float64(m)

        // Even step of the recurrence.
        numer := mf * (b - mf) * x / ((a + 2*mf - 1) * (a + 2*mf))
        d = 1 / raiseZero(1+numer*d)
        c = raiseZero(1 + numer/c)
        h *= d * c

        // Odd step of the recurrence.
        numer = -(a + mf) * (a + b + mf) * x / ((a + 2*mf) * (a + 2*mf + 1))
        d = 1 / raiseZero(1+numer*d)
        c = raiseZero(1 + numer/c)
        hfac := d * c
        h *= hfac

        if math.Abs(hfac-1) < epsilon {
            return h
        }
    }
    panic("betainc: a or b too big; failed to converge")
}
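A hypothetical sanity-check sketch (not part of the commit): Iₓ(a, b) behaves like a CDF in x, and I₀.₅(a, a) = 0.5 by symmetry.

    package obistats

    import "fmt"

    // exampleBetaInc illustrates mathBetaInc; it is not part of the commit.
    func exampleBetaInc() {
        fmt.Println(mathBetaInc(0.5, 2, 2)) // 0.5 by symmetry
        fmt.Println(mathBeta(1, 1))         // B(1, 1) = 1
    }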
pkg/obistats/data.go (new file, +225 lines)
@ -0,0 +1,225 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import (
    "fmt"
)

// A Collection is a collection of benchmark results.
type Collection struct {
    // Configs, Groups, and Units give the set of configs,
    // groups, and units from the keys in Stats in an order
    // meant to match the order the benchmarks were read in.
    Configs, Groups, Units []string

    // Benchmarks gives the set of benchmarks from the keys in
    // Stats by group in an order meant to match the order
    // benchmarks were read in.
    Benchmarks map[string][]string

    // Metrics holds the accumulated metrics for each key.
    Metrics map[Key]*Metrics

    // DeltaTest is the test to use to decide if a change is significant.
    // If nil, it defaults to UTest.
    DeltaTest DeltaTest

    // Alpha is the p-value cutoff to report a change as significant.
    // If zero, it defaults to 0.05.
    Alpha float64

    // AddGeoMean specifies whether to add a line to the table
    // showing the geometric mean of all the benchmark results.
    AddGeoMean bool

    // SplitBy specifies the labels to split results by.
    // By default, results will only be split by full name.
    SplitBy []string

    // Order specifies the row display order for this table.
    // If Order is nil, the table rows are printed in order of
    // first appearance in the input.
    Order Order
}

// A Key identifies one metric (e.g., "ns/op", "B/op") from one
// benchmark (function name sans "Benchmark" prefix) and optional
// group in one configuration (input file name).
type Key struct {
    Config, Group, Benchmark, Unit string
}

// A Metrics holds the measurements of a single metric
// (for example, ns/op or MB/s)
// for all runs of a particular benchmark.
type Metrics struct {
    Unit    string    // unit being measured
    Values  []float64 // measured values
    RValues []float64 // Values with outliers removed
    Min     float64   // min of RValues
    Mean    float64   // mean of RValues
    Max     float64   // max of RValues
}

// FormatMean formats m.Mean using scaler.
func (m *Metrics) FormatMean(scaler Scaler) string {
    var s string
    if scaler != nil {
        s = scaler(m.Mean)
    } else {
        s = fmt.Sprint(m.Mean)
    }
    return s
}

// FormatDiff computes and formats the percent variation of max and min compared to mean.
// If m.Mean or m.Max is zero, FormatDiff returns an empty string.
func (m *Metrics) FormatDiff() string {
    if m.Mean == 0 || m.Max == 0 {
        return ""
    }
    diff := 1 - m.Min/m.Mean
    if d := m.Max/m.Mean - 1; d > diff {
        diff = d
    }

    return fmt.Sprintf("±%.0f%%", diff*100.0)
}

// Format returns a textual formatting of "Mean ±Diff" using scaler.
func (m *Metrics) Format(scaler Scaler) string {
    if m.Unit == "" {
        return ""
    }
    mean := m.FormatMean(scaler)
    diff := m.FormatDiff()
    if diff == "" {
        return mean + " "
    }
    return fmt.Sprintf("%s %3s", mean, diff)
}

// computeStats updates the derived statistics in m from the raw
// samples in m.Values.
func (m *Metrics) computeStats() {
    // Discard outliers.
    values := Sample{Xs: m.Values}
    q1, q3 := values.Percentile(0.25), values.Percentile(0.75)
    lo, hi := q1-1.5*(q3-q1), q3+1.5*(q3-q1)
    for _, value := range m.Values {
        if lo <= value && value <= hi {
            m.RValues = append(m.RValues, value)
        }
    }

    // Compute statistics of remaining data.
    m.Min, m.Max = Bounds(m.RValues)
    m.Mean = Mean(m.RValues)
}

// addMetrics returns the metrics with the given key from c,
// creating a new one if needed.
func (c *Collection) addMetrics(key Key) *Metrics {
    if c.Metrics == nil {
        c.Metrics = make(map[Key]*Metrics)
    }
    if stat, ok := c.Metrics[key]; ok {
        return stat
    }

    addString := func(strings *[]string, add string) {
        for _, s := range *strings {
            if s == add {
                return
            }
        }
        *strings = append(*strings, add)
    }
    addString(&c.Configs, key.Config)
    addString(&c.Groups, key.Group)
    if c.Benchmarks == nil {
        c.Benchmarks = make(map[string][]string)
    }
    benchmarks := c.Benchmarks[key.Group]
    addString(&benchmarks, key.Benchmark)
    c.Benchmarks[key.Group] = benchmarks
    addString(&c.Units, key.Unit)
    m := &Metrics{Unit: key.Unit}
    c.Metrics[key] = m
    return m
}

// // AddFile adds the benchmark results in the formatted data
// // (read from the reader r) to the named configuration.
// func (c *Collection) AddFile(config string, f io.Reader) error {
//     c.Configs = append(c.Configs, config)
//     key := Key{Config: config}
//     br := benchfmt.NewReader(f)
//     for br.Next() {
//         c.addResult(key, br.Result())
//     }
//     return br.Err()
// }

// // AddData adds the benchmark results in the formatted data
// // (read from the reader r) to the named configuration.
// func (c *Collection) AddData(config string, data []byte) error {
//     return c.AddFile(config, bytes.NewReader(data))
// }

// AddResults adds the benchmark results to the named configuration.
// func (c *Collection) AddResults(config string, results []*benchfmt.Result) {
//     c.Configs = append(c.Configs, config)
//     key := Key{Config: config}
//     for _, r := range results {
//         c.addResult(key, r)
//     }
// }

// func (c *Collection) addResult(key Key, r *benchfmt.Result) {
//     f := strings.Fields(r.Content)
//     if len(f) < 4 {
//         return
//     }
//     name := f[0]
//     if !strings.HasPrefix(name, "Benchmark") {
//         return
//     }
//     name = strings.TrimPrefix(name, "Benchmark")
//     n, _ := strconv.Atoi(f[1])
//     if n == 0 {
//         return
//     }
//     key.Group = c.makeGroup(r)
//     key.Benchmark = name
//     for i := 2; i+2 <= len(f); i += 2 {
//         val, err := strconv.ParseFloat(f[i], 64)
//         if err != nil {
//             continue
//         }
//         key.Unit = f[i+1]
//         m := c.addMetrics(key)
//         m.Values = append(m.Values, val)
//     }
// }

// func (c *Collection) makeGroup(r *benchfmt.Result) string {
//     var out string
//     for _, s := range c.SplitBy {
//         v := r.NameLabels[s]
//         if v == "" {
//             v = r.Labels[s]
//         }
//         if v != "" {
//             if out != "" {
//                 out = out + " "
//             }
//             out += fmt.Sprintf("%s:%s", s, v)
//         }
//     }
//     return out
// }
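A hypothetical in-package sketch (not part of the commit; the package-level Mean helper lives in another file of the package): addMetrics registers a key once, and computeStats applies the 1.5×IQR outlier fence before computing Min, Mean and Max.

    package obistats

    import "fmt"

    // exampleMetrics illustrates the Collection/Metrics flow; it is not
    // part of the commit.
    func exampleMetrics() {
        c := &Collection{}
        m := c.addMetrics(Key{Config: "old", Benchmark: "Decode", Unit: "ns/op"})
        m.Values = []float64{100, 101, 99, 100, 100, 1000} // 1000 is an outlier
        m.computeStats()
        fmt.Println(m.Mean, len(m.RValues)) // 100 5 — the outlier was dropped
    }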
pkg/obistats/delta.go (new file, +54 lines)
@ -0,0 +1,54 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import "errors"

// A DeltaTest compares the old and new metrics and returns the
// expected probability that they are drawn from the same distribution.
//
// If a probability cannot be computed, the DeltaTest returns an
// error explaining why. Common errors include ErrSamplesEqual
// (all samples are equal), ErrSampleSize (there aren't enough samples),
// and ErrZeroVariance (the sample has zero variance).
//
// As a special case, the missing test NoDeltaTest returns -1, nil.
type DeltaTest func(old, new *Metrics) (float64, error)

// Errors returned by DeltaTest.
var (
    ErrSamplesEqual      = errors.New("all equal")
    ErrSampleSize        = errors.New("too few samples")
    ErrZeroVariance      = errors.New("zero variance")
    ErrMismatchedSamples = errors.New("samples have different lengths")
)

// NoDeltaTest applies no delta test; it returns -1, nil.
func NoDeltaTest(old, new *Metrics) (pval float64, err error) {
    return -1, nil
}

// TTest is a DeltaTest using the two-sample Welch t-test.
func TTest(old, new *Metrics) (pval float64, err error) {
    t, err := TwoSampleWelchTTest(
        Sample{Xs: old.RValues},
        Sample{Xs: new.RValues},
        LocationDiffers,
    )
    if err != nil {
        return -1, err
    }
    return t.P, nil
}

// UTest is a DeltaTest using the Mann-Whitney U test.
func UTest(old, new *Metrics) (pval float64, err error) {
    u, err := MannWhitneyUTest(old.RValues, new.RValues, LocationDiffers)
    if err != nil {
        return -1, err
    }
    return u.P, nil
}
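A hypothetical illustration (not part of the commit): a DeltaTest consumes the outlier-filtered RValues of two Metrics and returns a p-value.

    package obistats

    import "fmt"

    // exampleDeltaTest illustrates UTest; it is not part of the commit.
    func exampleDeltaTest() {
        before := &Metrics{RValues: []float64{10, 11, 12, 10, 11}}
        after := &Metrics{RValues: []float64{20, 21, 22, 20, 21}}
        p, err := UTest(before, after)
        fmt.Println(p, err) // small p-value: the two samples clearly differ
    }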
pkg/obistats/mannwhitney.go (new file, +275 lines)
@ -0,0 +1,275 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import (
    "math"
    "sort"
)

// A LocationHypothesis specifies the alternative hypothesis of a
// location test such as a t-test or a Mann-Whitney U-test. The
// default (zero) value is to test against the alternative hypothesis
// that they differ.
type LocationHypothesis int

//go:generate stringer -type LocationHypothesis

const (
    // LocationLess specifies the alternative hypothesis that the
    // location of the first sample is less than the second. This
    // is a one-tailed test.
    LocationLess LocationHypothesis = -1

    // LocationDiffers specifies the alternative hypothesis that
    // the locations of the two samples are not equal. This is a
    // two-tailed test.
    LocationDiffers LocationHypothesis = 0

    // LocationGreater specifies the alternative hypothesis that
    // the location of the first sample is greater than the
    // second. This is a one-tailed test.
    LocationGreater LocationHypothesis = 1
)

// A MannWhitneyUTestResult is the result of a Mann-Whitney U-test.
type MannWhitneyUTestResult struct {
    // N1 and N2 are the sizes of the input samples.
    N1, N2 int

    // U is the value of the Mann-Whitney U statistic for this
    // test, generalized by counting ties as 0.5.
    //
    // Given the Cartesian product of the two samples, this is the
    // number of pairs in which the value from the first sample is
    // greater than the value of the second, plus 0.5 times the
    // number of pairs where the values from the two samples are
    // equal. Hence, U is always an integer multiple of 0.5 (it is
    // a whole integer if there are no ties) in the range [0, N1*N2].
    //
    // U statistics always come in pairs, depending on which
    // sample is "first". The mirror U for the other sample can be
    // calculated as N1*N2 - U.
    //
    // There are many equivalent statistics with slightly
    // different definitions. The Wilcoxon (1945) W statistic
    // (generalized for ties) is U + (N1(N1+1))/2. It is also
    // common to use 2U to eliminate the half steps and Smid
    // (1956) uses N1*N2 - 2U to additionally center the
    // distribution.
    U float64

    // AltHypothesis specifies the alternative hypothesis tested
    // by this test against the null hypothesis that there is no
    // difference in the locations of the samples.
    AltHypothesis LocationHypothesis

    // P is the p-value of the Mann-Whitney test for the given
    // null hypothesis.
    P float64
}

// MannWhitneyExactLimit gives the largest sample size for which the
// exact U distribution will be used for the Mann-Whitney U-test.
//
// Using the exact distribution is necessary for small sample sizes
// because the distribution is highly irregular. However, computing
// the distribution for large sample sizes is both computationally
// expensive and unnecessary because it quickly approaches a normal
// approximation. Computing the distribution for two 50 value samples
// takes a few milliseconds on a 2014 laptop.
var MannWhitneyExactLimit = 50

// MannWhitneyTiesExactLimit gives the largest sample size for which
// the exact U distribution will be used for the Mann-Whitney U-test
// in the presence of ties.
//
// Computing this distribution is more expensive than computing the
// distribution without ties, so this is set lower. Computing this
// distribution for two 25 value samples takes about ten milliseconds
// on a 2014 laptop.
var MannWhitneyTiesExactLimit = 25

// MannWhitneyUTest performs a Mann-Whitney U-test [1,2] of the null
// hypothesis that two samples come from the same population against
// the alternative hypothesis that one sample tends to have larger or
// smaller values than the other.
//
// This is similar to a t-test, but unlike the t-test, the
// Mann-Whitney U-test is non-parametric (it does not assume a normal
// distribution). It has very slightly lower efficiency than the
// t-test on normal distributions.
//
// Computing the exact U distribution is expensive for large sample
// sizes, so this uses a normal approximation for sample sizes larger
// than MannWhitneyExactLimit if there are no ties or
// MannWhitneyTiesExactLimit if there are ties. This normal
// approximation uses both the tie correction and the continuity
// correction.
//
// This can fail with ErrSampleSize if either sample is empty or
// ErrSamplesEqual if all sample values are equal.
//
// This is also known as a Mann-Whitney-Wilcoxon test and is
// equivalent to the Wilcoxon rank-sum test, though the Wilcoxon
// rank-sum test differs in nomenclature.
//
// [1] Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of
// Whether one of Two Random Variables is Stochastically Larger than
// the Other". Annals of Mathematical Statistics 18 (1): 50–60.
//
// [2] Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer".
// Journal of the American Statistical Association 61 (315): 772-787.
func MannWhitneyUTest(x1, x2 []float64, alt LocationHypothesis) (*MannWhitneyUTestResult, error) {
    n1, n2 := len(x1), len(x2)
    if n1 == 0 || n2 == 0 {
        return nil, ErrSampleSize
    }

    // Compute the U statistic and tie vector T.
    x1 = append([]float64(nil), x1...)
    x2 = append([]float64(nil), x2...)
    sort.Float64s(x1)
    sort.Float64s(x2)
    merged, labels := labeledMerge(x1, x2)

    R1 := 0.0
    T, hasTies := []int{}, false
    for i := 0; i < len(merged); {
        rank1, nx1, v1 := i+1, 0, merged[i]
        // Consume samples that tie this sample (including itself).
        for ; i < len(merged) && merged[i] == v1; i++ {
            if labels[i] == 1 {
                nx1++
            }
        }
        // Assign all tied samples the average rank of the
        // samples, where merged[0] has rank 1.
        if nx1 != 0 {
            rank := float64(i+rank1) / 2
            R1 += rank * float64(nx1)
        }
        T = append(T, i-rank1+1)
        if i > rank1 {
            hasTies = true
        }
    }
    U1 := R1 - float64(n1*(n1+1))/2

    // Compute the smaller of U1 and U2
    U2 := float64(n1*n2) - U1
    Usmall := math.Min(U1, U2)

    var p float64
    if !hasTies && n1 <= MannWhitneyExactLimit && n2 <= MannWhitneyExactLimit ||
        hasTies && n1 <= MannWhitneyTiesExactLimit && n2 <= MannWhitneyTiesExactLimit {
        // Use exact U distribution. U1 will be an integer.
        if len(T) == 1 {
            // All values are equal. Test is meaningless.
            return nil, ErrSamplesEqual
        }

        dist := UDist{N1: n1, N2: n2, T: T}
        switch alt {
        case LocationDiffers:
            if U1 == U2 {
                // The distribution is symmetric about
                // Usmall. Since the distribution is
                // discrete, the CDF is discontinuous
                // and if simply double CDF(Usmall),
                // we'll double count the
                // (non-infinitesimal) probability
                // mass at Usmall. What we want is
                // just the integral of the whole CDF,
                // which is 1.
                p = 1
            } else {
                p = dist.CDF(Usmall) * 2
            }

        case LocationLess:
            p = dist.CDF(U1)

        case LocationGreater:
            p = 1 - dist.CDF(U1-1)
        }
    } else {
        // Use normal approximation (with tie and continuity
        // correction).
        t := tieCorrection(T)
        N := float64(n1 + n2)
        μU := float64(n1*n2) / 2
        σU := math.Sqrt(float64(n1*n2) * ((N + 1) - t/(N*(N-1))) / 12)
        if σU == 0 {
            return nil, ErrSamplesEqual
        }
        numer := U1 - μU
        // Perform continuity correction.
        switch alt {
        case LocationDiffers:
            numer -= mathSign(numer) * 0.5
        case LocationLess:
            numer += 0.5
        case LocationGreater:
            numer -= 0.5
        }
        z := numer / σU
        switch alt {
        case LocationDiffers:
            p = 2 * math.Min(StdNormal.CDF(z), 1-StdNormal.CDF(z))
        case LocationLess:
            p = StdNormal.CDF(z)
        case LocationGreater:
            p = 1 - StdNormal.CDF(z)
        }
    }

    return &MannWhitneyUTestResult{N1: n1, N2: n2, U: U1,
        AltHypothesis: alt, P: p}, nil
}

// labeledMerge merges sorted lists x1 and x2 into sorted list merged.
// labels[i] is 1 or 2 depending on whether merged[i] is a value from
// x1 or x2, respectively.
func labeledMerge(x1, x2 []float64) (merged []float64, labels []byte) {
    merged = make([]float64, len(x1)+len(x2))
    labels = make([]byte, len(x1)+len(x2))

    i, j, o := 0, 0, 0
    for i < len(x1) && j < len(x2) {
        if x1[i] < x2[j] {
            merged[o] = x1[i]
            labels[o] = 1
            i++
        } else {
            merged[o] = x2[j]
            labels[o] = 2
            j++
        }
        o++
    }
    for ; i < len(x1); i++ {
        merged[o] = x1[i]
        labels[o] = 1
        o++
    }
    for ; j < len(x2); j++ {
        merged[o] = x2[j]
        labels[o] = 2
        o++
    }
    return
}

// tieCorrection computes the tie correction factor Σ_j (t_j³ - t_j)
// where t_j is the number of ties in the j'th rank.
func tieCorrection(ties []int) float64 {
    t := 0
    for _, tie := range ties {
        t += tie*tie*tie - tie
    }
    return float64(t)
}
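A hypothetical one-tailed illustration (not part of the commit; UDist is defined in another file of the package not shown in this diff). With four untied values per sample, the exact U distribution is used and the smallest possible U has probability 1/C(8,4).

    package obistats

    import "fmt"

    // exampleMannWhitney illustrates MannWhitneyUTest; not part of the commit.
    func exampleMannWhitney() {
        x1 := []float64{1.1, 1.2, 1.3, 1.4}
        x2 := []float64{2.1, 2.2, 2.3, 2.4}
        r, err := MannWhitneyUTest(x1, x2, LocationLess)
        if err == nil {
            fmt.Println(r.U, r.P) // U = 0; p = 1/70 ≈ 0.014
        }
    }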
pkg/obistats/mathx.go (new file, +79 lines)
@ -0,0 +1,79 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import "math"

var inf = math.Inf(1)
var nan = math.NaN()

// mathSign returns the sign of x: -1 if x < 0, 0 if x == 0, 1 if x > 0.
// If x is NaN, it returns NaN.
func mathSign(x float64) float64 {
    if x == 0 {
        return 0
    } else if x < 0 {
        return -1
    } else if x > 0 {
        return 1
    }
    return nan
}

const smallFactLimit = 20 // 20! => 62 bits
var smallFact [smallFactLimit + 1]int64

func init() {
    smallFact[0] = 1
    fact := int64(1)
    for n := int64(1); n <= smallFactLimit; n++ {
        fact *= n
        smallFact[n] = fact
    }
}

// mathChoose returns the binomial coefficient of n and k.
func mathChoose(n, k int) float64 {
    if k == 0 || k == n {
        return 1
    }
    if k < 0 || n < k {
        return 0
    }
    if n <= smallFactLimit { // Implies k <= smallFactLimit
        // It's faster to do several integer multiplications
        // than it is to do an extra integer division.
        // Remarkably, this is also faster than pre-computing
        // Pascal's triangle (presumably because this is very
        // cache efficient).
        numer := int64(1)
        for n1 := int64(n - (k - 1)); n1 <= int64(n); n1++ {
            numer *= n1
        }
        denom := smallFact[k]
        return float64(numer / denom)
    }

    return math.Exp(lchoose(n, k))
}

// mathLchoose returns math.Log(mathChoose(n, k)).
func mathLchoose(n, k int) float64 {
    if k == 0 || k == n {
        return 0
    }
    if k < 0 || n < k {
        return math.NaN()
    }
    return lchoose(n, k)
}

func lchoose(n, k int) float64 {
    a, _ := math.Lgamma(float64(n + 1))
    b, _ := math.Lgamma(float64(k + 1))
    c, _ := math.Lgamma(float64(n - k + 1))
    return a - b - c
}
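A hypothetical check (not part of the commit): the fast integer path and the log-gamma path agree for small n.

    package obistats

    import (
        "fmt"
        "math"
    )

    // exampleChoose illustrates mathChoose/mathLchoose; not part of the commit.
    func exampleChoose() {
        fmt.Println(mathChoose(10, 3))            // 120 (8*9*10 / 3!)
        fmt.Println(math.Exp(mathLchoose(10, 3))) // ≈ 120 via log-gamma
    }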
pkg/obistats/normaldist.go (new file, +142 lines)
@ -0,0 +1,142 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import (
    "math"
    "math/rand"
)

// NormalDist is a normal (Gaussian) distribution with mean Mu and
// standard deviation Sigma.
type NormalDist struct {
    Mu, Sigma float64
}

// StdNormal is the standard normal distribution (Mu = 0, Sigma = 1)
var StdNormal = NormalDist{0, 1}

// 1/sqrt(2 * pi)
const invSqrt2Pi = 0.39894228040143267793994605993438186847585863116493465766592583

func (n NormalDist) PDF(x float64) float64 {
    z := x - n.Mu
    return math.Exp(-z*z/(2*n.Sigma*n.Sigma)) * invSqrt2Pi / n.Sigma
}

func (n NormalDist) pdfEach(xs []float64) []float64 {
    res := make([]float64, len(xs))
    if n.Mu == 0 && n.Sigma == 1 {
        // Standard normal fast path
        for i, x := range xs {
            res[i] = math.Exp(-x*x/2) * invSqrt2Pi
        }
    } else {
        a := -1 / (2 * n.Sigma * n.Sigma)
        b := invSqrt2Pi / n.Sigma
        for i, x := range xs {
            z := x - n.Mu
            res[i] = math.Exp(z*z*a) * b
        }
    }
    return res
}

func (n NormalDist) CDF(x float64) float64 {
    return math.Erfc(-(x-n.Mu)/(n.Sigma*math.Sqrt2)) / 2
}

func (n NormalDist) cdfEach(xs []float64) []float64 {
    res := make([]float64, len(xs))
    a := 1 / (n.Sigma * math.Sqrt2)
    for i, x := range xs {
        res[i] = math.Erfc(-(x-n.Mu)*a) / 2
    }
    return res
}

func (n NormalDist) InvCDF(p float64) (x float64) {
    // This is based on Peter John Acklam's inverse normal CDF
    // algorithm: http://home.online.no/~pjacklam/notes/invnorm/
    const (
        a1 = -3.969683028665376e+01
        a2 = 2.209460984245205e+02
        a3 = -2.759285104469687e+02
        a4 = 1.383577518672690e+02
        a5 = -3.066479806614716e+01
        a6 = 2.506628277459239e+00

        b1 = -5.447609879822406e+01
        b2 = 1.615858368580409e+02
        b3 = -1.556989798598866e+02
        b4 = 6.680131188771972e+01
        b5 = -1.328068155288572e+01

        c1 = -7.784894002430293e-03
        c2 = -3.223964580411365e-01
        c3 = -2.400758277161838e+00
        c4 = -2.549732539343734e+00
        c5 = 4.374664141464968e+00
        c6 = 2.938163982698783e+00

        d1 = 7.784695709041462e-03
        d2 = 3.224671290700398e-01
        d3 = 2.445134137142996e+00
        d4 = 3.754408661907416e+00

        plow  = 0.02425
        phigh = 1 - plow
    )

    if p < 0 || p > 1 {
        return nan
    } else if p == 0 {
        return -inf
    } else if p == 1 {
        return inf
    }

    if p < plow {
        // Rational approximation for lower region.
        q := math.Sqrt(-2 * math.Log(p))
        x = (((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) /
            ((((d1*q+d2)*q+d3)*q+d4)*q + 1)
    } else if phigh < p {
        // Rational approximation for upper region.
        q := math.Sqrt(-2 * math.Log(1-p))
        x = -(((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) /
            ((((d1*q+d2)*q+d3)*q+d4)*q + 1)
    } else {
        // Rational approximation for central region.
        q := p - 0.5
        r := q * q
        x = (((((a1*r+a2)*r+a3)*r+a4)*r+a5)*r + a6) * q /
            (((((b1*r+b2)*r+b3)*r+b4)*r+b5)*r + 1)
    }

    // Refine approximation.
    e := 0.5*math.Erfc(-x/math.Sqrt2) - p
    u := e * math.Sqrt(2*math.Pi) * math.Exp(x*x/2)
    x = x - u/(1+x*u/2)

    // Adjust from standard normal.
    return x*n.Sigma + n.Mu
}

func (n NormalDist) Rand(r *rand.Rand) float64 {
    var x float64
    if r == nil {
        x = rand.NormFloat64()
    } else {
        x = r.NormFloat64()
    }
    return x*n.Sigma + n.Mu
}

func (n NormalDist) Bounds() (float64, float64) {
    const stddevs = 3
    return n.Mu - stddevs*n.Sigma, n.Mu + stddevs*n.Sigma
}
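A hypothetical round-trip check (not part of the commit): after the refinement step, InvCDF and CDF invert each other to high precision.

    package obistats

    import "fmt"

    // exampleNormal illustrates StdNormal; it is not part of the commit.
    func exampleNormal() {
        z := StdNormal.InvCDF(0.975)
        fmt.Println(z)                // ≈ 1.95996 (the familiar 97.5% quantile)
        fmt.Println(StdNormal.CDF(z)) // ≈ 0.975
    }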
pkg/obistats/sample.go (new file, +341 lines)
@ -0,0 +1,341 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import (
    "math"
    "sort"
)

// Sample is a collection of possibly weighted data points.
type Sample struct {
    // Xs is the slice of sample values.
    Xs []float64

    // Weights[i] is the weight of sample Xs[i]. If Weights is
    // nil, all Xs have weight 1. Weights must have the same
    // length as Xs and all values must be non-negative.
    Weights []float64

    // Sorted indicates that Xs is sorted in ascending order.
    Sorted bool
}

// Bounds returns the minimum and maximum values of xs.
func Bounds(xs []float64) (min float64, max float64) {
    if len(xs) == 0 {
        return math.NaN(), math.NaN()
    }
    min, max = xs[0], xs[0]
    for _, x := range xs {
        if x < min {
            min = x
        }
        if x > max {
            max = x
        }
    }
    return
}

// Bounds returns the minimum and maximum values of the Sample.
//
// If the Sample is weighted, this ignores samples with zero weight.
//
// This is constant time if s.Sorted and there are no zero-weighted
// values.
func (s Sample) Bounds() (min float64, max float64) {
    if len(s.Xs) == 0 || (!s.Sorted && s.Weights == nil) {
        return Bounds(s.Xs)
    }

    if s.Sorted {
        if s.Weights == nil {
            return s.Xs[0], s.Xs[len(s.Xs)-1]
        }
        min, max = math.NaN(), math.NaN()
        for i, w := range s.Weights {
            if w != 0 {
                min = s.Xs[i]
                break
            }
        }
        if math.IsNaN(min) {
            return
        }
        for i := range s.Weights {
            if s.Weights[len(s.Weights)-i-1] != 0 {
                max = s.Xs[len(s.Weights)-i-1]
                break
            }
        }
    } else {
        min, max = math.Inf(1), math.Inf(-1)
        for i, x := range s.Xs {
            w := s.Weights[i]
            if x < min && w != 0 {
                min = x
            }
            if x > max && w != 0 {
                max = x
            }
        }
        if math.IsInf(min, 0) {
            min, max = math.NaN(), math.NaN()
        }
    }
    return
}

// vecSum returns the sum of xs.
func vecSum(xs []float64) float64 {
    sum := 0.0
    for _, x := range xs {
        sum += x
    }
    return sum
}

// Sum returns the (possibly weighted) sum of the Sample.
func (s Sample) Sum() float64 {
    if s.Weights == nil {
        return vecSum(s.Xs)
    }
    sum := 0.0
    for i, x := range s.Xs {
        sum += x * s.Weights[i]
    }
    return sum
}

// Weight returns the total weight of the Sample.
func (s Sample) Weight() float64 {
    if s.Weights == nil {
        return float64(len(s.Xs))
    }
    return vecSum(s.Weights)
}

// // Mean returns the arithmetic mean of xs.
// func Mean(xs []float64) float64 {
//     if len(xs) == 0 {
//         return math.NaN()
//     }
//     m := 0.0
//     for i, x := range xs {
//         m += (x - m) / float64(i+1)
//     }
//     return m
// }

// Mean returns the arithmetic mean of the Sample.
func (s Sample) Mean() float64 {
    if len(s.Xs) == 0 || s.Weights == nil {
        return Mean(s.Xs)
    }

    m, wsum := 0.0, 0.0
    for i, x := range s.Xs {
        // Use weighted incremental mean:
        //   m_i = (1 - w_i/wsum_i) * m_(i-1) + (w_i/wsum_i) * x_i
        //       = m_(i-1) + (x_i - m_(i-1)) * (w_i/wsum_i)
        w := s.Weights[i]
        wsum += w
        m += (x - m) * w / wsum
    }
    return m
}

// GeoMean returns the geometric mean of xs. xs must be positive.
func GeoMean(xs []float64) float64 {
    if len(xs) == 0 {
        return math.NaN()
    }
    m := 0.0
    for i, x := range xs {
        if x <= 0 {
            return math.NaN()
        }
        lx := math.Log(x)
        m += (lx - m) / float64(i+1)
    }
    return math.Exp(m)
}

// GeoMean returns the geometric mean of the Sample. All sample
// values must be positive.
func (s Sample) GeoMean() float64 {
    if len(s.Xs) == 0 || s.Weights == nil {
        return GeoMean(s.Xs)
    }

    m, wsum := 0.0, 0.0
    for i, x := range s.Xs {
        w := s.Weights[i]
        wsum += w
        lx := math.Log(x)
        m += (lx - m) * w / wsum
    }
    return math.Exp(m)
}

// Variance returns the sample variance of xs.
func Variance(xs []float64) float64 {
    if len(xs) == 0 {
        return math.NaN()
    } else if len(xs) <= 1 {
        return 0
    }

    // Based on Wikipedia's presentation of Welford 1962
    // (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm).
    // This is more numerically stable than the standard two-pass
    // formula and not prone to massive cancellation.
    mean, M2 := 0.0, 0.0
    for n, x := range xs {
        delta := x - mean
        mean += delta / float64(n+1)
        M2 += delta * (x - mean)
    }
    return M2 / float64(len(xs)-1)
}

// Variance returns the sample variance of the Sample.
func (s Sample) Variance() float64 {
    if len(s.Xs) == 0 || s.Weights == nil {
        return Variance(s.Xs)
    }
    // TODO(austin)
    panic("Weighted Variance not implemented")
}

// StdDev returns the sample standard deviation of xs.
func StdDev(xs []float64) float64 {
    return math.Sqrt(Variance(xs))
}

// StdDev returns the sample standard deviation of the Sample.
func (s Sample) StdDev() float64 {
    if len(s.Xs) == 0 || s.Weights == nil {
        return StdDev(s.Xs)
    }
    // TODO(austin)
    panic("Weighted StdDev not implemented")
}

// Percentile returns the pctileth value from the Sample. This uses
// interpolation method R8 from Hyndman and Fan (1996).
//
// pctile will be capped to the range [0, 1]. If len(xs) == 0 or all
// weights are 0, returns NaN.
//
// Percentile(0.5) is the median. Percentile(0.25) and
// Percentile(0.75) are the first and third quartiles, respectively.
//
// This is constant time if s.Sorted and s.Weights == nil.
func (s Sample) Percentile(pctile float64) float64 {
    if len(s.Xs) == 0 {
        return math.NaN()
    } else if pctile <= 0 {
        min, _ := s.Bounds()
        return min
    } else if pctile >= 1 {
        _, max := s.Bounds()
        return max
    }

    if !s.Sorted {
        // TODO(austin) Use select algorithm instead
        s = *s.Copy().Sort()
    }

    if s.Weights == nil {
        N := float64(len(s.Xs))
        //n := pctile * (N + 1) // R6
        n := 1/3.0 + pctile*(N+1/3.0) // R8
        kf, frac := math.Modf(n)
        k := int(kf)
        if k <= 0 {
            return s.Xs[0]
        } else if k >= len(s.Xs) {
            return s.Xs[len(s.Xs)-1]
        }
        return s.Xs[k-1] + frac*(s.Xs[k]-s.Xs[k-1])
    }
    // TODO(austin): Implement interpolation

    target := s.Weight() * pctile

    // TODO(austin) If we had cumulative weights, we could
    // do this in log time.
    for i, weight := range s.Weights {
        target -= weight
        if target < 0 {
            return s.Xs[i]
        }
    }
    return s.Xs[len(s.Xs)-1]
}

// IQR returns the interquartile range of the Sample.
//
// This is constant time if s.Sorted and s.Weights == nil.
func (s Sample) IQR() float64 {
    if !s.Sorted {
        s = *s.Copy().Sort()
    }
    return s.Percentile(0.75) - s.Percentile(0.25)
}

type sampleSorter struct {
    xs      []float64
    weights []float64
}

func (p *sampleSorter) Len() int {
    return len(p.xs)
}

func (p *sampleSorter) Less(i, j int) bool {
    return p.xs[i] < p.xs[j]
}

func (p *sampleSorter) Swap(i, j int) {
    p.xs[i], p.xs[j] = p.xs[j], p.xs[i]
    p.weights[i], p.weights[j] = p.weights[j], p.weights[i]
}

// Sort sorts the samples in place in s and returns s.
//
// A sorted sample improves the performance of some algorithms.
func (s *Sample) Sort() *Sample {
    if s.Sorted || sort.Float64sAreSorted(s.Xs) {
        // All set
    } else if s.Weights == nil {
        sort.Float64s(s.Xs)
    } else {
        sort.Sort(&sampleSorter{s.Xs, s.Weights})
    }
    s.Sorted = true
    return s
}

// Copy returns a copy of the Sample.
//
// The returned Sample shares no data with the original, so they can
// be modified (for example, sorted) independently.
func (s Sample) Copy() *Sample {
    xs := make([]float64, len(s.Xs))
    copy(xs, s.Xs)

    weights := []float64(nil)
    if s.Weights != nil {
        weights = make([]float64, len(s.Weights))
        copy(weights, s.Weights)
    }

    return &Sample{xs, weights, s.Sorted}
}
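A hypothetical illustration (not part of the commit): Percentile copies and sorts as needed, then interpolates with the R8 rule, so an unsorted Sample is safe to query.

    package obistats

    import "fmt"

    // exampleSample illustrates Percentile/IQR; it is not part of the commit.
    func exampleSample() {
        s := Sample{Xs: []float64{3, 1, 4, 1, 5, 9, 2, 6}}
        fmt.Println(s.Percentile(0.5)) // 3.5 (R8-interpolated median)
        fmt.Println(s.IQR())           // q3 - q1 under the same R8 rule
    }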
pkg/obistats/scaler.go (new file, +119 lines)
@ -0,0 +1,119 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import (
    "fmt"
    "strings"
)

// A Scaler is a function that scales and formats a measurement.
// All measurements within a given table row are formatted
// using the same scaler, so that the units are consistent
// across the row.
type Scaler func(float64) string

// NewScaler returns a Scaler appropriate for formatting
// the measurement val, which has the given unit.
func NewScaler(val float64, unit string) Scaler {
    if hasBaseUnit(unit, "ns/op") || hasBaseUnit(unit, "ns/GC") {
        return timeScaler(val)
    }

    var format string
    var scale float64
    var suffix string

    prescale := 1.0
    if hasBaseUnit(unit, "MB/s") {
        prescale = 1e6
    }

    switch x := val * prescale; {
    case x >= 99500000000000:
        format, scale, suffix = "%.0f", 1e12, "T"
    case x >= 9950000000000:
        format, scale, suffix = "%.1f", 1e12, "T"
    case x >= 995000000000:
        format, scale, suffix = "%.2f", 1e12, "T"
    case x >= 99500000000:
        format, scale, suffix = "%.0f", 1e9, "G"
    case x >= 9950000000:
        format, scale, suffix = "%.1f", 1e9, "G"
    case x >= 995000000:
        format, scale, suffix = "%.2f", 1e9, "G"
    case x >= 99500000:
        format, scale, suffix = "%.0f", 1e6, "M"
    case x >= 9950000:
        format, scale, suffix = "%.1f", 1e6, "M"
    case x >= 995000:
        format, scale, suffix = "%.2f", 1e6, "M"
    case x >= 99500:
        format, scale, suffix = "%.0f", 1e3, "k"
    case x >= 9950:
        format, scale, suffix = "%.1f", 1e3, "k"
    case x >= 995:
        format, scale, suffix = "%.2f", 1e3, "k"
    case x >= 99.5:
        format, scale, suffix = "%.0f", 1, ""
    case x >= 9.95:
        format, scale, suffix = "%.1f", 1, ""
    default:
        format, scale, suffix = "%.2f", 1, ""
    }

    if hasBaseUnit(unit, "B/op") || hasBaseUnit(unit, "bytes/op") || hasBaseUnit(unit, "bytes") {
        suffix += "B"
    }
    if hasBaseUnit(unit, "MB/s") {
        suffix += "B/s"
    }
    scale /= prescale

    return func(val float64) string {
        return fmt.Sprintf(format+suffix, val/scale)
    }
}

func timeScaler(ns float64) Scaler {
    var format string
    var scale float64
    switch x := ns / 1e9; {
    case x >= 99.5:
        format, scale = "%.0fs", 1
    case x >= 9.95:
        format, scale = "%.1fs", 1
    case x >= 0.995:
        format, scale = "%.2fs", 1
    case x >= 0.0995:
        format, scale = "%.0fms", 1000
    case x >= 0.00995:
        format, scale = "%.1fms", 1000
    case x >= 0.000995:
        format, scale = "%.2fms", 1000
    case x >= 0.0000995:
        format, scale = "%.0fµs", 1000*1000
    case x >= 0.00000995:
        format, scale = "%.1fµs", 1000*1000
    case x >= 0.000000995:
        format, scale = "%.2fµs", 1000*1000
    case x >= 0.0000000995:
        format, scale = "%.0fns", 1000*1000*1000
    case x >= 0.00000000995:
        format, scale = "%.1fns", 1000*1000*1000
    default:
        format, scale = "%.2fns", 1000*1000*1000
    }
    return func(ns float64) string {
        return fmt.Sprintf(format, ns/1e9*scale)
    }
}

// hasBaseUnit reports whether s has unit unit.
// For now, it reports whether s == unit or s ends in -unit.
func hasBaseUnit(s, unit string) bool {
    return s == unit || strings.HasSuffix(s, "-"+unit)
}
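A hypothetical illustration (not part of the commit): the scaler is chosen once from a representative value, then formats every value in its row with the same unit.

    package obistats

    import "fmt"

    // exampleScaler illustrates NewScaler; it is not part of the commit.
    func exampleScaler() {
        sc := NewScaler(123456, "ns/op") // nanoseconds → microsecond scale
        fmt.Println(sc(123456))          // "123µs"
    }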
pkg/obistats/sort.go (new file, +37 lines)
@ -0,0 +1,37 @@
package obistats

//
// Duplicated code from internal module available at:
// https://github.com/golang-design/bench.git
//

import (
    "math"
    "sort"
)

// An Order defines a sort order for a table.
// It reports whether t.Rows[i] should appear before t.Rows[j].
type Order func(t *Table, i, j int) bool

// ByName sorts tables by the Benchmark name column
func ByName(t *Table, i, j int) bool {
    return t.Rows[i].Benchmark < t.Rows[j].Benchmark
}

// ByDelta sorts tables by the Delta column,
// reversing the order when larger is better (for "speed" results).
func ByDelta(t *Table, i, j int) bool {
    return math.Abs(t.Rows[i].PctDelta)*float64(t.Rows[i].Change) <
        math.Abs(t.Rows[j].PctDelta)*float64(t.Rows[j].Change)
}

// Reverse returns the reverse of the given order.
func Reverse(order Order) Order {
    return func(t *Table, i, j int) bool { return order(t, j, i) }
}

// Sort sorts a Table t (in place) by the given order.
func Sort(t *Table, order Order) {
    sort.SliceStable(t.Rows, func(i, j int) bool { return order(t, i, j) })
}
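A hypothetical illustration (not part of the commit): orders compose, so Reverse(ByDelta) puts the row with the largest sign-adjusted |delta| first.

    package obistats

    import "fmt"

    // exampleSort illustrates Sort/Reverse; it is not part of the commit.
    func exampleSort() {
        t := &Table{Rows: []*Row{
            {Benchmark: "A", PctDelta: 2, Change: -1},
            {Benchmark: "B", PctDelta: -5, Change: +1},
        }}
        Sort(t, Reverse(ByDelta))
        fmt.Println(t.Rows[0].Benchmark) // "B": 5*(+1) outranks 2*(-1)
    }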
pkg/obistats/table.go (new file, +213 lines)
@ -0,0 +1,213 @@
|
||||
package obistats
|
||||
|
||||
//
|
||||
// Dupplicated code from internal module available at :
|
||||
// https://github.com/golang-design/bench.git
|
||||
//
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// A Table is a table for display in the benchstat output.
type Table struct {
	Metric      string
	OldNewDelta bool // is this an old-new-delta table?
	Configs     []string
	Groups      []string
	Rows        []*Row
}

// A Row is a table row for display in the benchstat output.
type Row struct {
	Benchmark string     // benchmark name
	Group     string     // group name
	Scaler    Scaler     // formatter for stats means
	Metrics   []*Metrics // columns of statistics
	PctDelta  float64    // unformatted percent change
	Delta     string     // formatted percent change
	Note      string     // additional information
	Change    int        // +1 better, -1 worse, 0 unchanged
}

// Tables returns tables comparing the benchmarks in the collection.
func (c *Collection) Tables() []*Table {
	deltaTest := c.DeltaTest
	if deltaTest == nil {
		deltaTest = UTest
	}
	alpha := c.Alpha
	if alpha == 0 {
		alpha = 0.05
	}

	// Update statistics.
	for _, m := range c.Metrics {
		m.computeStats()
	}

	var tables []*Table
	key := Key{}
	for _, key.Unit = range c.Units {
		table := new(Table)
		table.Configs = c.Configs
		table.Groups = c.Groups
		table.Metric = metricOf(key.Unit)
		table.OldNewDelta = len(c.Configs) == 2
		for _, key.Group = range c.Groups {
			for _, key.Benchmark = range c.Benchmarks[key.Group] {
				row := &Row{Benchmark: key.Benchmark}
				if len(c.Groups) > 1 {
					// Show group headers if there is more than one group.
					row.Group = key.Group
				}

				for _, key.Config = range c.Configs {
					m := c.Metrics[key]
					if m == nil {
						row.Metrics = append(row.Metrics, new(Metrics))
						continue
					}
					row.Metrics = append(row.Metrics, m)
					if row.Scaler == nil {
						row.Scaler = NewScaler(m.Mean, m.Unit)
					}
				}

				// If there are only two configs being compared, add stats.
				if table.OldNewDelta {
					k0 := key
					k0.Config = c.Configs[0]
					k1 := key
					k1.Config = c.Configs[1]
					old := c.Metrics[k0]
					new := c.Metrics[k1]
					// If one is missing, omit row entirely.
					// TODO: Control this better.
					if old == nil || new == nil {
						continue
					}
					pval, testerr := deltaTest(old, new)
					row.PctDelta = 0.00
					row.Delta = "~"
					if testerr == ErrZeroVariance {
						row.Note = "(zero variance)"
					} else if testerr == ErrSampleSize {
						row.Note = "(too few samples)"
					} else if testerr == ErrSamplesEqual {
						row.Note = "(all equal)"
					} else if testerr != nil {
						row.Note = fmt.Sprintf("(%s)", testerr)
					} else if pval < alpha {
						if new.Mean == old.Mean {
							row.Delta = "0.00%"
						} else {
							pct := ((new.Mean / old.Mean) - 1.0) * 100.0
							row.PctDelta = pct
							row.Delta = fmt.Sprintf("%+.2f%%", pct)
							if pct < 0 == (table.Metric != "speed") { // smaller is better, except speeds
								row.Change = +1
							} else {
								row.Change = -1
							}
						}
					}
					if row.Note == "" && pval != -1 {
						row.Note = fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues))
					}
				}

				table.Rows = append(table.Rows, row)
			}
		}

		if len(table.Rows) > 0 {
			if c.Order != nil {
				Sort(table, c.Order)
			}
			if c.AddGeoMean {
				addGeomean(c, table, key.Unit, table.OldNewDelta)
			}
			tables = append(tables, table)
		}
	}

	return tables
}

var metricSuffix = map[string]string{
	"ns/op": "time/op",
	"ns/GC": "time/GC",
	"B/op":  "alloc/op",
	"MB/s":  "speed",
}

// metricOf returns the name of the metric with the given unit.
func metricOf(unit string) string {
	if s := metricSuffix[unit]; s != "" {
		return s
	}
	for s, suff := range metricSuffix {
		if dashs := "-" + s; strings.HasSuffix(unit, dashs) {
			prefix := strings.TrimSuffix(unit, dashs)
			return prefix + "-" + suff
		}
	}
	return unit
}
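
// Illustrative sketch (added for clarity, not part of the original file):
//
//	metricOf("ns/op")     // "time/op"
//	metricOf("MB/s")      // "speed"
//	metricOf("p50-ns/op") // "p50-time/op", via the suffix rule
//	metricOf("allocs/op") // unknown unit: returned unchanged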

// addGeomean adds a "geomean" row to the table,
// showing the geometric mean of all the benchmarks.
func addGeomean(c *Collection, t *Table, unit string, delta bool) {
	row := &Row{Benchmark: "[Geo mean]"}
	key := Key{Unit: unit}
	geomeans := []float64{}
	maxCount := 0
	for _, key.Config = range c.Configs {
		var means []float64
		for _, key.Group = range c.Groups {
			for _, key.Benchmark = range c.Benchmarks[key.Group] {
				m := c.Metrics[key]
				// Omit 0 values from the geomean calculation,
				// as these either make the geomean undefined
				// or zero (depending on who you ask). This
				// typically comes up with things like
				// allocation counts, where it's fine to just
				// ignore the benchmark.
				if m != nil && m.Mean != 0 {
					means = append(means, m.Mean)
				}
			}
		}
		if len(means) > maxCount {
			maxCount = len(means)
		}
		if len(means) == 0 {
			row.Metrics = append(row.Metrics, new(Metrics))
			delta = false
		} else {
			geomean := GeoMean(means)
			geomeans = append(geomeans, geomean)
			if row.Scaler == nil {
				row.Scaler = NewScaler(geomean, unit)
			}
			row.Metrics = append(row.Metrics, &Metrics{
				Unit: unit,
				Mean: geomean,
			})
		}
	}
	if maxCount <= 1 {
		// Only one benchmark contributed to this geomean.
		// Since the geomean is the same as the benchmark
		// result, don't bother outputting it.
		return
	}
	if delta {
		pct := ((geomeans[1] / geomeans[0]) - 1.0) * 100.0
		row.PctDelta = pct
		row.Delta = fmt.Sprintf("%+.2f%%", pct)
	}
	t.Rows = append(t.Rows, row)
}
37
pkg/obistats/tdist.go
Normal file
@ -0,0 +1,37 @@
package obistats

//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//

import "math"

// A TDist is a Student's t-distribution with V degrees of freedom.
type TDist struct {
	V float64
}

// PDF is the probability density function of the t-distribution.
func (t TDist) PDF(x float64) float64 {
	return math.Exp(lgamma((t.V+1)/2)-lgamma(t.V/2)) /
		math.Sqrt(t.V*math.Pi) * math.Pow(1+(x*x)/t.V, -(t.V+1)/2)
}

// CDF is the cumulative distribution function of the t-distribution.
func (t TDist) CDF(x float64) float64 {
	if x == 0 {
		return 0.5
	} else if x > 0 {
		return 1 - 0.5*mathBetaInc(t.V/(t.V+x*x), t.V/2, 0.5)
	} else if x < 0 {
		return 1 - t.CDF(-x)
	} else {
		return math.NaN()
	}
}

// Bounds returns the default plotting bounds of the distribution.
func (t TDist) Bounds() (float64, float64) {
	return -4, 4
}
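
// Illustrative sketch (added for clarity, not part of the original file):
// a two-sided p-value can be derived from the CDF above, as done by
// newTTestResult in ttest.go for the LocationDiffers hypothesis.
//
//	dist := TDist{V: 10}
//	p := 2 * (1 - dist.CDF(math.Abs(2.23))) // ≈ 0.05 for dof = 10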
143
pkg/obistats/ttest.go
Normal file
@ -0,0 +1,143 @@
package obistats

//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//

import (
	"math"
)

// A TTestResult is the result of a t-test.
type TTestResult struct {
	// N1 and N2 are the sizes of the input samples. For a
	// one-sample t-test, N2 is 0.
	N1, N2 int

	// T is the value of the t-statistic for this t-test.
	T float64

	// DoF is the degrees of freedom for this t-test.
	DoF float64

	// AltHypothesis specifies the alternative hypothesis tested
	// by this test against the null hypothesis that there is no
	// difference in the means of the samples.
	AltHypothesis LocationHypothesis

	// P is the p-value for this t-test for the given null hypothesis.
	P float64
}

func newTTestResult(n1, n2 int, t, dof float64, alt LocationHypothesis) *TTestResult {
	dist := TDist{dof}
	var p float64
	switch alt {
	case LocationDiffers:
		p = 2 * (1 - dist.CDF(math.Abs(t)))
	case LocationLess:
		p = dist.CDF(t)
	case LocationGreater:
		p = 1 - dist.CDF(t)
	}
	return &TTestResult{N1: n1, N2: n2, T: t, DoF: dof, AltHypothesis: alt, P: p}
}

// A TTestSample is a sample that can be used for a one or two sample
// t-test.
type TTestSample interface {
	Weight() float64
	Mean() float64
	Variance() float64
}

// TwoSampleTTest performs a two-sample (unpaired) Student's t-test on
// samples x1 and x2. This is a test of the null hypothesis that x1
// and x2 are drawn from populations with equal means. It assumes x1
// and x2 are independent samples, that the distributions have equal
// variance, and that the populations are normally distributed.
func TwoSampleTTest(x1, x2 TTestSample, alt LocationHypothesis) (*TTestResult, error) {
	n1, n2 := x1.Weight(), x2.Weight()
	if n1 == 0 || n2 == 0 {
		return nil, ErrSampleSize
	}
	v1, v2 := x1.Variance(), x2.Variance()
	if v1 == 0 && v2 == 0 {
		return nil, ErrZeroVariance
	}

	dof := n1 + n2 - 2
	v12 := ((n1-1)*v1 + (n2-1)*v2) / dof
	t := (x1.Mean() - x2.Mean()) / math.Sqrt(v12*(1/n1+1/n2))
	return newTTestResult(int(n1), int(n2), t, dof, alt), nil
}

// TwoSampleWelchTTest performs a two-sample (unpaired) Welch's t-test
// on samples x1 and x2. This is like TwoSampleTTest, but does not
// assume the distributions have equal variance.
func TwoSampleWelchTTest(x1, x2 TTestSample, alt LocationHypothesis) (*TTestResult, error) {
	n1, n2 := x1.Weight(), x2.Weight()
	if n1 <= 1 || n2 <= 1 {
		// TODO: Can we still do this with n == 1?
		return nil, ErrSampleSize
	}
	v1, v2 := x1.Variance(), x2.Variance()
	if v1 == 0 && v2 == 0 {
		return nil, ErrZeroVariance
	}

	dof := math.Pow(v1/n1+v2/n2, 2) /
		(math.Pow(v1/n1, 2)/(n1-1) + math.Pow(v2/n2, 2)/(n2-1))
	s := math.Sqrt(v1/n1 + v2/n2)
	t := (x1.Mean() - x2.Mean()) / s
	return newTTestResult(int(n1), int(n2), t, dof, alt), nil
}
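
// Illustrative sketch (added for clarity, not part of the original file):
// a minimal TTestSample over a plain []float64, built on the package's
// Mean and StdDev helpers, is enough to exercise the tests above.
//
//	type floatSample []float64
//
//	func (s floatSample) Weight() float64   { return float64(len(s)) }
//	func (s floatSample) Mean() float64     { return Mean(s) }
//	func (s floatSample) Variance() float64 { sd := StdDev(s); return sd * sd }
//
//	r, _ := TwoSampleWelchTTest(floatSample{1.1, 1.2, 1.3},
//		floatSample{1.5, 1.7, 1.9}, LocationDiffers)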
// PairedTTest performs a two-sample paired t-test on samples x1 and
// x2. If μ0 is non-zero, this tests if the average of the difference
// is significantly different from μ0. If x1 and x2 are identical,
// this returns a nil result with ErrZeroVariance.
func PairedTTest(x1, x2 []float64, μ0 float64, alt LocationHypothesis) (*TTestResult, error) {
	if len(x1) != len(x2) {
		return nil, ErrMismatchedSamples
	}
	if len(x1) <= 1 {
		// TODO: Can we still do this with n == 1?
		return nil, ErrSampleSize
	}

	dof := float64(len(x1) - 1)

	diff := make([]float64, len(x1))
	for i := range x1 {
		diff[i] = x1[i] - x2[i]
	}
	sd := StdDev(diff)
	if sd == 0 {
		// TODO: Can we still do the test?
		return nil, ErrZeroVariance
	}
	t := (Mean(diff) - μ0) * math.Sqrt(float64(len(x1))) / sd
	return newTTestResult(len(x1), len(x2), t, dof, alt), nil
}
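
// Illustrative sketch (added for clarity, not part of the original file):
// testing whether "before" measurements exceed "after" measurements on
// the same items.
//
//	before := []float64{10.1, 9.8, 10.3, 10.0}
//	after := []float64{9.5, 9.4, 9.9, 9.6}
//	r, _ := PairedTTest(before, after, 0, LocationGreater)
//	// r.P is small when the mean difference is reliably positive.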

// OneSampleTTest performs a one-sample t-test on sample x. This tests
// the null hypothesis that the population mean is equal to μ0. This
// assumes the distribution of the population of sample means is
// normal.
func OneSampleTTest(x TTestSample, μ0 float64, alt LocationHypothesis) (*TTestResult, error) {
	n, v := x.Weight(), x.Variance()
	if n == 0 {
		return nil, ErrSampleSize
	}
	if v == 0 {
		// TODO: Can we still do the test?
		return nil, ErrZeroVariance
	}
	dof := n - 1
	t := (x.Mean() - μ0) * math.Sqrt(n) / math.Sqrt(v)
	return newTTestResult(int(n), 0, t, dof, alt), nil
}
386
pkg/obistats/udist.go
Normal file
@ -0,0 +1,386 @@
package obistats

//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//

import "math"

// A UDist is the discrete probability distribution of the
// Mann-Whitney U statistic for a pair of samples of sizes N1 and N2.
//
// The details of computing this distribution with no ties can be
// found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of
// Whether one of Two Random Variables is Stochastically Larger than
// the Other". Annals of Mathematical Statistics 18 (1): 50–60.
// Computing this distribution in the presence of ties is described in
// Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer".
// Journal of the American Statistical Association 61 (315): 772-787
// and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
// 805-813 (the former paper contains details that are glossed over in
// the latter paper but has mathematical typesetting issues, so it's
// easiest to get the context from the former paper and the details
// from the latter).
type UDist struct {
	N1, N2 int

	// T is the count of the number of ties at each rank in the
	// input distributions. T may be nil, in which case it is
	// assumed there are no ties (which is equivalent to an M+N
	// slice of 1s). It must be the case that Sum(T) == M+N.
	T []int
}

// hasTies returns true if d has any tied samples.
func (d UDist) hasTies() bool {
	for _, t := range d.T {
		if t > 1 {
			return true
		}
	}
	return false
}

// p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947
// for values of U from 0 up to and including the U argument.
//
// This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite
// fast for small values of N1 and N2. However, it does not handle ties.
func (d UDist) p(U int) []float64 {
	// This is a dynamic programming implementation of the
	// recursive recurrence definition given by Mann and Whitney:
	//
	//	p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m)
	//	p_{n,m}(U) = 0                            if U < 0
	//	p_{0,m}(U) = p_{n,0}(U) = 1 / nCr(m+n, n) if U = 0
	//	                        = 0               if U > 0
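	// For example, with n = m = 1 and U = 0 the recurrence gives
	// p_{1,1}(0) = (1*p_{0,1}(-1) + 1*p_{1,0}(0)) / 2 = (0 + 1) / 2 = 1/2,
	// matching the two equally likely orderings of one observation from
	// each sample. (Worked example added here for illustration.)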
	//
	// (Note that there is a typo in the original paper. The first
	// recursive application of p should be for U-m, not U-M.)
	//
	// Since p{n,m} only depends on p{n-1,m} and p{n,m-1}, we only
	// need to store one "plane" of the three dimensional space at
	// a time.
	//
	// Furthermore, p_{n,m} = p_{m,n}, so we only construct values
	// for n <= m and obtain the rest through symmetry.
	//
	// We organize the computed values of p as follows:
	//
	//	  n →   N
	//	m *
	//	↓ * *
	//	  * * *
	//	  * * * *
	//	  * * * *
	//	M * * * *
	//
	// where each * is a slice indexed by U. The code below
	// computes these left-to-right, top-to-bottom, so it only
	// stores one row of this matrix at a time. Furthermore,
	// computing an element in a given U slice only depends on the
	// same and smaller values of U, so we can overwrite the U
	// slice we're computing in place as long as we start with the
	// largest value of U. Finally, even though the recurrence
	// depends on (n,m) above the diagonal and we use symmetry to
	// mirror those across the diagonal to (m,n), the mirrored
	// indexes are always available in the current row, so this
	// mirroring does not interfere with our ability to recycle
	// state.

	N, M := d.N1, d.N2
	if N > M {
		N, M = M, N
	}

	memo := make([][]float64, N+1)
	for n := range memo {
		memo[n] = make([]float64, U+1)
	}

	for m := 0; m <= M; m++ {
		// Compute p_{0,m}. This is zero except for U=0.
		memo[0][0] = 1

		// Compute the remainder of this row.
		nlim := N
		if m < nlim {
			nlim = m
		}
		for n := 1; n <= nlim; n++ {
			lp := memo[n-1] // p_{n-1,m}
			var rp []float64
			if n <= m-1 {
				rp = memo[n] // p_{n,m-1}
			} else {
				rp = memo[m-1] // p{m-1,n} and m==n
			}

			// For a given n,m, U is at most n*m.
			//
			// TODO: Actually, it's at most ⌈n*m/2⌉, but
			// then we need to use more complex symmetries
			// in the inner loop below.
			ulim := n * m
			if U < ulim {
				ulim = U
			}

			out := memo[n] // p_{n,m}
			nplusm := float64(n + m)
			for U1 := ulim; U1 >= 0; U1-- {
				l := 0.0
				if U1-m >= 0 {
					l = float64(n) * lp[U1-m]
				}
				r := float64(m) * rp[U1]
				out[U1] = (l + r) / nplusm
			}
		}
	}
	return memo[N]
}

type ukey struct {
	n1   int // size of first sample
	twoU int // 2*U statistic for this permutation
}

// This computes the cumulative counts of the Mann-Whitney U
// distribution in the presence of ties using the computation from
// Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
// 805-813, with much guidance from appendix L of Klotz, A
// Computational Approach to Statistics.
//
// makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the
// number of ranks (up to len(t)), n1 is the size of the first sample
// (up to the n1 argument), and U is the U statistic (up to the
// argument twoU/2). The value of an entry in the memo table is the
// number of permutations of a sample of size n1 in a ranking with tie
// vector t[:K] having a U statistic <= U.
func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 {
	// Another candidate for a fast implementation is van de Wiel,
	// "The split-up algorithm: a fast symbolic method for
	// computing p-values of distribution-free statistics". This
	// is what's used by R's coin package. It's a comparatively
	// recent publication, so it's presumably faster (or perhaps
	// just more general) than previous techniques, but I can't
	// get my hands on the paper.
	//
	// TODO: ~40% of this function's time is spent in mapassign on
	// the assignment lines in the two loops and another ~20% in
	// map access and iteration. Improving map behavior or
	// replacing the maps altogether with some other constant-time
	// structure could double performance.
	//
	// TODO: The worst case for this function is when there are
	// few ties. Yet the best case overall is when there are *no*
	// ties. Can we get the best of both worlds? Use the fast
	// algorithm for the most part when there are few ties and mix
	// in the general algorithm just where we need it? That's
	// certainly possible for sub-problems where t[:k] has no
	// ties, but that doesn't help if t[0] has a tie but nothing
	// else does. Is it possible to rearrange the ranks without
	// messing up our computation of the U statistic for
	// sub-problems?

	K := len(t)

	// Compute a coefficients. The a slice is indexed by k (a[0]
	// is unused).
	a := make([]int, K+1)
	a[1] = t[0]
	for k := 2; k <= K; k++ {
		a[k] = a[k-1] + t[k-2] + t[k-1]
	}

	// Create the memo table for the counts function, A. The A
	// slice is indexed by k (A[0] is unused).
	//
	// In "The Mann Whitney Distribution Using Linked Lists", they
	// use linked lists (*gasp*) for this, but within each K it's
	// really just a memoization table, so it's faster to use a
	// map. The outer structure is a slice indexed by k because we
	// need to find all memo entries with certain values of k.
	//
	// TODO: The n1 and twoU values in the ukeys follow strict
	// patterns. For each K value, the n1 values are every integer
	// between two bounds. For each (K, n1) value, the twoU values
	// are every integer multiple of a certain base between two
	// bounds. It might be worth turning these into directly
	// indexable slices.
	A := make([]map[ukey]float64, K+1)
	A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0}

	// Compute memo table (k, n1, twoU) triples from high K values
	// to low K values. This drives the recurrence relation
	// downward to figure out all of the needed argument triples.
	//
	// TODO: Is it possible to generate this table bottom-up? If
	// so, this could be a pure dynamic programming algorithm and
	// we could discard the K dimension. We could at least store
	// the inputs in a more compact representation that replaces
	// the twoU dimension with an interval and a step size (as
	// suggested by Cheung, Klotz, not that they make it at all
	// clear *why* they're suggesting this).
	tsum := sumint(t) // always ∑ t[0:k]
	for k := K - 1; k >= 2; k-- {
		tsum -= t[k]
		A[k] = make(map[ukey]float64)

		// Construct A[k] from A[k+1].
		for A_kplus1 := range A[k+1] {
			rkLow := maxint(0, A_kplus1.n1-tsum)
			rkHigh := minint(A_kplus1.n1, t[k])
			for rk := rkLow; rk <= rkHigh; rk++ {
				twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk)
				n1_k := A_kplus1.n1 - rk
				if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) {
					key := ukey{n1: n1_k, twoU: twoU_k}
					A[k][key] = 0
				}
			}
		}
	}

	// Fill counts in memo table from low K values to high K
	// values. This unwinds the recurrence relation.

	// Start with K==2 base case.
	//
	// TODO: Later computations depend on these, but these don't
	// depend on anything (including each other), so if K==2, we
	// can skip the memo table altogether.
	if K < 2 {
		panic("K < 2")
	}
	N_2 := t[0] + t[1]
	for A_2i := range A[2] {
		Asum := 0.0
		r2Low := maxint(0, A_2i.n1-t[0])
		r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2
		for r2 := r2Low; r2 <= r2High; r2++ {
			Asum += mathChoose(t[0], A_2i.n1-r2) *
				mathChoose(t[1], r2)
		}
		A[2][A_2i] = Asum
	}

	// Derive counts for the rest of the memo table.
	tsum = t[0] // always ∑ t[0:k-1]
	for k := 3; k <= K; k++ {
		tsum += t[k-2]

		// Compute A[k] counts from A[k-1] counts.
		for A_ki := range A[k] {
			Asum := 0.0
			rkLow := maxint(0, A_ki.n1-tsum)
			rkHigh := minint(A_ki.n1, t[k-1])
			for rk := rkLow; rk <= rkHigh; rk++ {
				twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk)
				n1_kminus1 := A_ki.n1 - rk
				x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}]
				if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 {
					x = mathChoose(tsum, n1_kminus1)
				}
				Asum += x * mathChoose(t[k-1], rk)
			}
			A[k][A_ki] = Asum
		}
	}

	return A
}

func twoUmin(n1 int, t, a []int) int {
	K := len(t)
	twoU := -n1 * n1
	n1_k := n1
	for k := 1; k <= K; k++ {
		twoU_k := minint(n1_k, t[k-1])
		twoU += twoU_k * a[k]
		n1_k -= twoU_k
	}
	return twoU
}

func twoUmax(n1 int, t, a []int) int {
	K := len(t)
	twoU := -n1 * n1
	n1_k := n1
	for k := K; k > 0; k-- {
		twoU_k := minint(n1_k, t[k-1])
		twoU += twoU_k * a[k]
		n1_k -= twoU_k
	}
	return twoU
}

func (d UDist) PMF(U float64) float64 {
	if U < 0 || U >= 0.5+float64(d.N1*d.N2) {
		return 0
	}

	if d.hasTies() {
		// makeUmemo computes the CDF directly. Take its
		// difference to get the PMF.
		p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}]
		p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}]
		if !ok1 || !ok2 {
			panic("makeUmemo did not return expected memoization table")
		}
		return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1)
	}

	// There are no ties. Use the fast algorithm. U must be integral.
	Ui := int(math.Floor(U))
	// TODO: Use symmetry to minimize U
	return d.p(Ui)[Ui]
}

func (d UDist) CDF(U float64) float64 {
	if U < 0 {
		return 0
	} else if U >= float64(d.N1*d.N2) {
		return 1
	}

	if d.hasTies() {
		// TODO: Minimize U?
		p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}]
		if !ok {
			panic("makeUmemo did not return expected memoization table")
		}
		return p / mathChoose(d.N1+d.N2, d.N1)
	}

	// There are no ties. Use the fast algorithm. U must be integral.
	Ui := int(math.Floor(U))
	// The distribution is symmetric around U = m * n / 2. Sum up
	// whichever tail is smaller.
	flip := Ui >= (d.N1*d.N2+1)/2
	if flip {
		Ui = d.N1*d.N2 - Ui - 1
	}
	pdfs := d.p(Ui)
	p := 0.0
	for _, pdf := range pdfs[:Ui+1] {
		p += pdf
	}
	if flip {
		p = 1 - p
	}
	return p
}

func (d UDist) Step() float64 {
	return 0.5
}

func (d UDist) Bounds() (float64, float64) {
	// TODO: More precise bounds when there are ties.
	return 0, float64(d.N1 * d.N2)
}
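
// Illustrative sketch (added for clarity, not part of the original file):
// for two small samples without ties, the one-sided p-value of an observed
// integer statistic u (a placeholder variable) is the upper tail of this
// distribution.
//
//	d := UDist{N1: 4, N2: 4}
//	pGE := 1 - d.CDF(float64(u)-1) // P(U >= u)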
@ -1,6 +1,8 @@
package obicleandb

import (
	"math/rand"

	log "github.com/sirupsen/logrus"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
@ -18,6 +20,114 @@ func SequenceTrust(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error
	return obiseq.BioSequenceSlice{sequence}, nil
}

func MakeSequenceFamilyGenusWorker(references obiseq.BioSequenceSlice) obiseq.SeqWorker {

	genus := make(map[int]*obiseq.BioSequenceSlice)
	family := make(map[int]*obiseq.BioSequenceSlice)

	for _, ref := range references {
		g, _ := ref.GetIntAttribute("genus_taxid")
		f, _ := ref.GetIntAttribute("family_taxid")

		gs, ok := genus[g]
		if !ok {
			gs = obiseq.NewBioSequenceSlice(0)
			genus[g] = gs
		}

		*gs = append(*gs, ref)

		fs, ok := family[f]
		if !ok {
			fs = obiseq.NewBioSequenceSlice(0)
			family[f] = fs
		}

		*fs = append(*fs, ref)
	}

	f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		g, _ := sequence.GetIntAttribute("genus_taxid")
		sequence.SetAttribute("obicleandb_level", "genus")

		gs := genus[g]

		indist := make([]float64, 0, gs.Len())
		for _, s := range *gs {
			if s != sequence {
				lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
				indist = append(indist, float64(lali-lca))
			}
		}
		nindist := len(indist)

		pval := 0.0

		f, _ := sequence.GetIntAttribute("family_taxid") // shadows the worker closure variable f
		fs := family[f]

		if nindist < 5 {
			sequence.SetAttribute("obicleandb_level", "family")

			for _, s := range *fs {
				gf, _ := s.GetIntAttribute("genus_taxid")
				if g != gf {
					lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
					indist = append(indist, float64(lali-lca))
				}
			}

			nindist = len(indist)
		}

		if nindist > 0 {

			next := nindist
			if next <= 20 {
				next = 20
			}

			outdist := make([]float64, 0, nindist)
			p := rand.Perm(references.Len())
			i := 0
			for _, ir := range p {
				s := references[ir]
				ff, _ := s.GetIntAttribute("family_taxid")

				if ff != f {
					lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
					outdist = append(outdist, float64(lali-lca))
					i += 1
					if i >= next {
						break
					}
				}
			}

			res, err := obistats.MannWhitneyUTest(outdist, indist, obistats.LocationGreater)

			if err == nil {
				pval = res.P
			}

			level, _ := sequence.GetAttribute("obicleandb_level")
			log.Warnf("%s - level: %v", sequence.Id(), level)
			log.Warnf("%s - gdist: %v", sequence.Id(), indist)
			log.Warnf("%s - fdist: %v", sequence.Id(), outdist)
			log.Warnf("%s - pval: %f", sequence.Id(), pval)
		} else {
			sequence.SetAttribute("obicleandb_level", "none")
		}

		sequence.SetAttribute("obicleandb_trusted", pval)

		return obiseq.BioSequenceSlice{sequence}, nil
	}

	return f

}
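
// Illustrative summary (added for clarity, not part of the commit): the
// worker above scores how well a sequence fits its own genus. It collects
// LCS-based distances (lali-lca) to the other members of the genus (falling
// back to other genera of the same family when fewer than 5 within-genus
// distances exist), draws a comparable number (at least 20) of distances to
// sequences from other families, and compares the two samples with a
// one-sided Mann-Whitney U test. A typical wiring, mirroring ICleanDB below:
//
//	worker := MakeSequenceFamilyGenusWorker(references)
//	trusted := obiiter.IBatchOver(references, obioptions.CLIBatchSize()).
//		MakeIWorker(worker, true)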

func diagCoord(x, y, n int) int {
	if x > y {
		x, y = y, x
@ -160,19 +270,26 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
		obioptions.CLIParallelWorkers(),
	)

	genera_iterator, err := obichunk.ISequenceChunk(
		annotated,
		obiseq.AnnotationClassifier("genus_taxid", "NA"),
	)
	references := annotated.Load()

	if err != nil {
		log.Fatal(err)
	}
	mannwithney := MakeSequenceFamilyGenusWorker(references)

	trusted := genera_iterator.MakeISliceWorker(
		SequenceTrustSlice,
		false,
	)
	partof := obiiter.IBatchOver(references,
		obioptions.CLIBatchSize()).Speed("Testing belonging to genus")

	return trusted
	// genera_iterator, err := obichunk.ISequenceChunk(
	// 	annotated,
	// 	obiseq.AnnotationClassifier("genus_taxid", "NA"),
	// )

	// if err != nil {
	// 	log.Fatal(err)
	// }

	// trusted := genera_iterator.MakeISliceWorker(
	// 	SequenceTrustSlice,
	// 	false,
	// )

	return partof.MakeIWorker(mannwithney, true)
}
44
pkg/obitools/obidemerge/demerge.go
Normal file
@ -0,0 +1,44 @@
package obidemerge

import (
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)

// MakeDemergeWorker builds a worker that expands a sequence carrying
// statistics on the given slot into one copy per observed value.
func MakeDemergeWorker(key string) obiseq.SeqWorker {
	desc := obiseq.MakeStatsOnDescription(key)
	f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {

		if sequence.HasStatsOn(key) {
			stats := sequence.StatsOn(desc, "NA")
			sequence.DeleteAttribute(obiseq.StatsOnSlotName(key))
			slice := obiseq.NewBioSequenceSlice(len(stats))
			i := 0

			for k, v := range stats {
				(*slice)[i] = sequence.Copy()
				(*slice)[i].SetAttribute(key, k)
				(*slice)[i].SetCount(v)
				i++
			}

			return *slice, nil
		}

		return obiseq.BioSequenceSlice{sequence}, nil
	}

	return obiseq.SeqWorker(f)
}
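
// Illustrative sketch (added for clarity, not part of the commit): a
// sequence annotated with, say, a merged slot {"A": 2, "B": 1} for the
// demerged key is expanded by the worker above into two copies, one with
// key="A" and count=2, the other with key="B" and count=1; the merged slot
// itself is deleted from both copies. The slot values here are hypothetical.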

func CLIDemergeSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {

	if CLIHasSlotToDemerge() {

		worker := MakeDemergeWorker(CLIDemergeSlot())
		return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0)
	}

	return iterator
}
30
pkg/obitools/obidemerge/options.go
Normal file
@ -0,0 +1,30 @@
package obidemerge

import (
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
	"github.com/DavidGamba/go-getoptions"
)

var _Demerge = ""

func DemergeOptionSet(options *getoptions.GetOpt) {

	options.StringVar(&_Demerge, "demerge", _Demerge,
		options.Alias("d"),
		options.Description("Indicates which slot has to be demerged."))
}

// OptionSet adds to the basic option set every option declared for
// the obidemerge command.
func OptionSet(options *getoptions.GetOpt) {
	obiconvert.OptionSet(options)
	DemergeOptionSet(options)
}

func CLIDemergeSlot() string {
	return _Demerge
}

func CLIHasSlotToDemerge() bool {
	return _Demerge != ""
}
132
pkg/obitools/obijoin/join.go
Normal file
@ -0,0 +1,132 @@
package obijoin

import (
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"

	log "github.com/sirupsen/logrus"
)

type IndexedSequenceSlice struct {
	Sequences obiseq.BioSequenceSlice
	Indices   []map[interface{}]*obiutils.Set[int]
}

func (s IndexedSequenceSlice) Len() int {
	return len(s.Sequences)
}

// Get returns the sequences matching every key, intersecting the
// per-key indices.
func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
	var keeps obiutils.Set[int]

	for i, v := range s.Indices {
		ids, ok := v[keys[i]]
		if !ok {
			// No sequence carries this key value: the join is empty.
			rep := obiseq.MakeBioSequenceSlice(0)
			return &rep
		}
		if i == 0 {
			keeps = *ids
		} else {
			keeps = keeps.Intersection(*ids)
		}
	}

	rep := obiseq.MakeBioSequenceSlice(len(keeps))
	for i, v := range keeps.Members() {
		rep[i] = s.Sequences[v]
	}

	return &rep
}

func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
	indices := make([]map[interface{}]*obiutils.Set[int], len(keys))

	for i, k := range keys {
		idx := make(map[interface{}]*obiutils.Set[int])

		for j, seq := range seqs {

			if value, ok := seq.GetAttribute(k); ok {
				goods, ok := idx[value]
				if !ok {
					goods = obiutils.NewSet[int]()
					idx[value] = goods
				}

				goods.Add(j)
			}
		}

		indices[i] = idx
	}

	return IndexedSequenceSlice{seqs, indices}
}

func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSequence, updateQuality bool) obiseq.SeqWorker {
	f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		var ok bool

		keys := make([]interface{}, len(by))

		for i, v := range by {
			keys[i], ok = sequence.GetAttribute(v)
			if !ok {
				return obiseq.BioSequenceSlice{sequence}, nil
			}
		}

		join_with := index.Get(keys...)

		if join_with.Len() == 0 {
			return obiseq.BioSequenceSlice{sequence}, nil
		}

		rep := obiseq.MakeBioSequenceSlice(join_with.Len())

		for i, v := range *join_with {
			rep[i] = sequence.Copy()
			annot := rep[i].Annotations()
			new_annot := v.Annotations()

			for k, v := range new_annot {
				annot[k] = v
			}

			if updateId {
				rep[i].SetId(v.Id())
			}
			if updateSequence && len(v.Sequence()) > 0 {
				rep[i].SetSequence(v.Sequence())
			}
			if updateQuality && len(v.Qualities()) > 0 {
				rep[i].SetQualities(v.Qualities())
			}
		}

		return rep, nil
	}

	return obiseq.SeqWorker(f)
}

func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {

	data_iter, err := obiformats.ReadSequencesFromFile(CLIJoinWith())

	if err != nil {
		log.Fatalf("Cannot read the data file to join with: %s %v", CLIJoinWith(), err)
	}

	data := data_iter.Load()

	keys := CLIBy()

	index := BuildIndexedSequenceSlice(data, keys.Right)

	worker := MakeJoinWorker(keys.Left, index, CLIUpdateId(), CLIUpdateSequence(), CLIUpdateQuality())

	iterator = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())

	return iterator
}
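
// Illustrative usage (added for clarity, not part of the commit): joining a
// sequence file with a secondary file on matching attribute values. File
// names, attribute names, and the exact invocation form are placeholders.
//
//	obijoin --join-with taxonomy.csv --by taxid=taxon_id --update-id seqs.fasta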
90
pkg/obitools/obijoin/options.go
Normal file
@ -0,0 +1,90 @@
package obijoin

import (
	"strings"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
	"github.com/DavidGamba/go-getoptions"
)

var _by = []string{}
var _join = ""
var _UpdateID = false
var _UpdateSequence = false
var _UpdateQuality = false

type By struct {
	Left  []string
	Right []string
}

func JoinOptionSet(options *getoptions.GetOpt) {

	options.StringSliceVar(&_by, "by", 1, 1,
		options.Alias("b"),
		options.Description("Declares the keys used to join the two files."))

	options.StringVar(&_join, "join-with", _join,
		options.Alias("j"),
		options.Description("File name of the file to join with."),
		options.Required("You must provide a file name to join with."))

	options.BoolVar(&_UpdateID, "update-id", _UpdateID,
		options.Alias("i"),
		options.Description("Update the sequence IDs from the joined file."))

	options.BoolVar(&_UpdateSequence, "update-sequence", _UpdateSequence,
		options.Alias("s"),
		options.Description("Update the sequence from the joined file."))

	options.BoolVar(&_UpdateQuality, "update-quality", _UpdateQuality,
		options.Alias("q"),
		options.Description("Update the quality from the joined file."))

}

// OptionSet adds to the basic option set every option declared for
// the obijoin command.
func OptionSet(options *getoptions.GetOpt) {
	obiconvert.OptionSet(options)
	JoinOptionSet(options)
}

// CLIBy parses the --by options into matching left (main file) and
// right (joined file) key names.
func CLIBy() By {
	if len(_by) == 0 {
		return By{
			Left:  []string{"id"},
			Right: []string{"id"},
		}
	}

	left := make([]string, len(_by))
	right := make([]string, len(_by))

	for i, v := range _by {
		vals := strings.Split(v, "=")
		left[i] = vals[0]
		right[i] = vals[0]
		if len(vals) > 1 {
			right[i] = vals[1]
		}
	}

	return By{Left: left, Right: right}
}
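
// Illustrative examples (added for clarity): a bare key such as "-b taxid"
// joins on the same attribute in both files, while "-b taxid=taxon_id"
// matches the taxid attribute of the main file against the taxon_id
// attribute of the joined file (attribute names are placeholders).
//
//	// --by taxid=taxon_id  =>  By{Left: []string{"taxid"}, Right: []string{"taxon_id"}}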

func CLIJoinWith() string {
	return _join
}

func CLIUpdateId() bool {
	return _UpdateID
}

func CLIUpdateSequence() bool {
	return _UpdateSequence
}

func CLIUpdateQuality() bool {
	return _UpdateQuality
}