diff --git a/Release-notes.md b/Release-notes.md
index 9ec0510..74e22c7 100644
--- a/Release-notes.md
+++ b/Release-notes.md
@@ -20,6 +20,24 @@ ### New features
+- Adds a new `obijoin` utility to join information contained in a sequence
+  file with that contained in another sequence or CSV file. The command
+  allows you to specify the names of the keys, in the main sequence file
+  and in the secondary data file, that are used to perform the join operation.
+
+- Adds a new tool `obidemerge` to demerge a `merge_xxx` slot by recreating
+  the multiple identical sequences, each with its `xxx` slot restored to its
+  initial value and its count set to the number of occurrences referred to
+  in the `merge_xxx` slot. During the operation, the `merge_xxx` slot is removed.
+
+- Adds CSV as an input format for every obitools command. To encode
+  sequences, the CSV file must include a column named `sequence` and another
+  column named `id`. An extra column named `qualities` can be added to give
+  the quality scores of the sequence, using the same ASCII encoding as the
+  FASTQ format. All other columns are treated as annotations and are
+  interpreted as JSON values, which may be atomic. If a column value
+  cannot be decoded as JSON, it is kept as a string (see the sketch below).
+
 - A new option **--version** has been added to every obitools command. It
   will print the version of the command.
diff --git a/go.mod b/go.mod
index e545222..81aed9d 100644
--- a/go.mod
+++ b/go.mod
@@ -32,6 +32,7 @@ require (
 	github.com/klauspost/cpuid/v2 v2.0.9 // indirect
 	github.com/kr/pretty v0.3.0 // indirect
 	github.com/kr/text v0.2.0 // indirect
+	github.com/montanaflynn/stats v0.7.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rogpeppe/go-internal v1.6.1 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
diff --git a/go.sum b/go.sum
index 6c2eae6..6ae044b 100644
--- a/go.sum
+++ b/go.sum
@@ -58,6 +58,8 @@ github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZ
 github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
+github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE=
+github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go
index 61e544b..fd801d0 100644
--- a/pkg/obioptions/version.go
+++ b/pkg/obioptions/version.go
@@ -7,7 +7,7 @@ import (
 // TODO: The version number is extracted from git. This induces that the version
 // corresponds to the last commit, and not the one when the file will be
 // commited
-var _Commit = "2bc540d"
+var _Commit = "a365bb6"
 var _Version = "Release 4.2.0"
 
 // Version returns the version of the obitools package.
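A minimal sketch of the CSV annotation-decoding rule described in the release
notes above: try to decode each extra column as JSON, and keep the raw text
as a string when that fails. This is an illustration only, not the actual
obitools implementation; decodeAnnotation is a hypothetical helper name.

package main

import (
	"encoding/json"
	"fmt"
)

// decodeAnnotation interprets one CSV cell as a JSON value and keeps the
// raw text as a string when it is not valid JSON.
func decodeAnnotation(cell string) interface{} {
	var v interface{}
	if err := json.Unmarshal([]byte(cell), &v); err != nil {
		return cell
	}
	return v
}

func main() {
	// "12" and "true" decode as atomic JSON values, the object decodes
	// as a map, and "forest" (not valid JSON) stays a plain string.
	for _, cell := range []string{"12", "true", `{"A":10,"B":2}`, "forest"} {
		fmt.Printf("%-16s -> %#v\n", cell, decodeAnnotation(cell))
	}
}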
diff --git a/pkg/obiseq/merge.go b/pkg/obiseq/merge.go index 70824e4..26163ae 100644 --- a/pkg/obiseq/merge.go +++ b/pkg/obiseq/merge.go @@ -49,6 +49,8 @@ func MakeStatsOnDescription(descriptor string) StatsOnDescription { } } +var _merge_prefix = "merged_" + // StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute. // // Parameters: @@ -57,7 +59,7 @@ func MakeStatsOnDescription(descriptor string) StatsOnDescription { // Return type: // - string func StatsOnSlotName(key string) string { - return "merged_" + key + return _merge_prefix + key } // HasStatsOn tests if the sequence has already a slot summarizing statistics of occurrence for a given attribute. @@ -115,12 +117,12 @@ func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOn } } default: - stats = make(StatsOnValues, 100) + stats = make(StatsOnValues, 10) annotations[mkey] = stats newstat = true } } else { - stats = make(StatsOnValues, 100) + stats = make(StatsOnValues, 10) annotations[mkey] = stats newstat = true } @@ -165,7 +167,7 @@ func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSeq log.Fatalf("Trying to make stats on a float value (%v : %T)", value, value) } default: - log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v : %T)", value, value) + log.Fatalf("Trying to make stats not on a string, integer or boolean value (%v : %T)", value, value) } retval = true } @@ -234,7 +236,7 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool if tomerge.HasAnnotation() { ma := tomerge.Annotations() for k, va := range annotations { - if !strings.HasPrefix(k, "merged_") { + if !strings.HasPrefix(k, _merge_prefix) { vm, ok := ma[k] if ok { switch vm := vm.(type) { @@ -255,7 +257,7 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool } } else { for k := range annotations { - if !strings.HasPrefix(k, "merged_") { + if !strings.HasPrefix(k, _merge_prefix) { delete(annotations, k) } } diff --git a/pkg/obistats/algo.go b/pkg/obistats/algo.go new file mode 100644 index 0000000..0aeeb92 --- /dev/null +++ b/pkg/obistats/algo.go @@ -0,0 +1,113 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +// Miscellaneous helper algorithms + +import ( + "fmt" +) + +func maxint(a, b int) int { + if a > b { + return a + } + return b +} + +func minint(a, b int) int { + if a < b { + return a + } + return b +} + +func sumint(xs []int) int { + sum := 0 + for _, x := range xs { + sum += x + } + return sum +} + +// bisect returns an x in [low, high] such that |f(x)| <= tolerance +// using the bisection method. +// +// f(low) and f(high) must have opposite signs. +// +// If f does not have a root in this interval (e.g., it is +// discontiguous), this returns the X of the apparent discontinuity +// and false. 
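+// For example, bisect(math.Cos, 1, 2, 1e-9) converges to a value near
+// π/2 ≈ 1.5708 and returns true, since cos changes sign on [1, 2].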
+func bisect(f func(float64) float64, low, high, tolerance float64) (float64, bool) { + flow, fhigh := f(low), f(high) + if -tolerance <= flow && flow <= tolerance { + return low, true + } + if -tolerance <= fhigh && fhigh <= tolerance { + return high, true + } + if mathSign(flow) == mathSign(fhigh) { + panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%g f(%g)=%g", low, flow, high, fhigh)) + } + for { + mid := (high + low) / 2 + fmid := f(mid) + if -tolerance <= fmid && fmid <= tolerance { + return mid, true + } + if mid == high || mid == low { + return mid, false + } + if mathSign(fmid) == mathSign(flow) { + low = mid + flow = fmid + } else { + high = mid + fhigh = fmid + } + } +} + +// bisectBool implements the bisection method on a boolean function. +// It returns x1, x2 ∈ [low, high], x1 < x2 such that f(x1) != f(x2) +// and x2 - x1 <= xtol. +// +// If f(low) == f(high), it panics. +func bisectBool(f func(float64) bool, low, high, xtol float64) (x1, x2 float64) { + flow, fhigh := f(low), f(high) + if flow == fhigh { + panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%v f(%g)=%v", low, flow, high, fhigh)) + } + for { + if high-low <= xtol { + return low, high + } + mid := (high + low) / 2 + if mid == high || mid == low { + return low, high + } + fmid := f(mid) + if fmid == flow { + low = mid + flow = fmid + } else { + high = mid + fhigh = fmid + } + } +} + +// series returns the sum of the series f(0), f(1), ... +// +// This implementation is fast, but subject to round-off error. +func series(f func(float64) float64) float64 { + y, yp := 0.0, 1.0 + for n := 0.0; y != yp; n++ { + yp = y + y += f(n) + } + return y +} diff --git a/pkg/obistats/beta.go b/pkg/obistats/beta.go new file mode 100644 index 0000000..ac91e75 --- /dev/null +++ b/pkg/obistats/beta.go @@ -0,0 +1,93 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import "math" + +func lgamma(x float64) float64 { + y, _ := math.Lgamma(x) + return y +} + +// mathBeta returns the value of the complete beta function B(a, b). +func mathBeta(a, b float64) float64 { + // B(x,y) = Γ(x)Γ(y) / Γ(x+y) + return math.Exp(lgamma(a) + lgamma(b) - lgamma(a+b)) +} + +// mathBetaInc returns the value of the regularized incomplete beta +// function Iₓ(a, b). +// +// This is not to be confused with the "incomplete beta function", +// which can be computed as BetaInc(x, a, b)*Beta(a, b). +// +// If x < 0 or x > 1, returns NaN. +func mathBetaInc(x, a, b float64) float64 { + // Based on Numerical Recipes in C, section 6.4. This uses the + // continued fraction definition of I: + // + // (xᵃ*(1-x)ᵇ)/(a*B(a,b)) * (1/(1+(d₁/(1+(d₂/(1+...)))))) + // + // where B(a,b) is the beta function and + // + // d_{2m+1} = -(a+m)(a+b+m)x/((a+2m)(a+2m+1)) + // d_{2m} = m(b-m)x/((a+2m-1)(a+2m)) + if x < 0 || x > 1 { + return math.NaN() + } + bt := 0.0 + if 0 < x && x < 1 { + // Compute the coefficient before the continued + // fraction. + bt = math.Exp(lgamma(a+b) - lgamma(a) - lgamma(b) + + a*math.Log(x) + b*math.Log(1-x)) + } + if x < (a+1)/(a+b+2) { + // Compute continued fraction directly. + return bt * betacf(x, a, b) / a + } + // Compute continued fraction after symmetry transform. + return 1 - bt*betacf(1-x, b, a)/b +} + +// betacf is the continued fraction component of the regularized +// incomplete beta function Iₓ(a, b). 
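+// The loop below evaluates the fraction with the modified Lentz algorithm,
+// with raiseZero guarding the running numerators and denominators from
+// underflowing to zero.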
+func betacf(x, a, b float64) float64 { + const maxIterations = 200 + const epsilon = 3e-14 + + raiseZero := func(z float64) float64 { + if math.Abs(z) < math.SmallestNonzeroFloat64 { + return math.SmallestNonzeroFloat64 + } + return z + } + + c := 1.0 + d := 1 / raiseZero(1-(a+b)*x/(a+1)) + h := d + for m := 1; m <= maxIterations; m++ { + mf := float64(m) + + // Even step of the recurrence. + numer := mf * (b - mf) * x / ((a + 2*mf - 1) * (a + 2*mf)) + d = 1 / raiseZero(1+numer*d) + c = raiseZero(1 + numer/c) + h *= d * c + + // Odd step of the recurrence. + numer = -(a + mf) * (a + b + mf) * x / ((a + 2*mf) * (a + 2*mf + 1)) + d = 1 / raiseZero(1+numer*d) + c = raiseZero(1 + numer/c) + hfac := d * c + h *= hfac + + if math.Abs(hfac-1) < epsilon { + return h + } + } + panic("betainc: a or b too big; failed to converge") +} diff --git a/pkg/obistats/data.go b/pkg/obistats/data.go new file mode 100644 index 0000000..138650a --- /dev/null +++ b/pkg/obistats/data.go @@ -0,0 +1,225 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import ( + "fmt" +) + +// A Collection is a collection of benchmark results. +type Collection struct { + // Configs, Groups, and Units give the set of configs, + // groups, and units from the keys in Stats in an order + // meant to match the order the benchmarks were read in. + Configs, Groups, Units []string + + // Benchmarks gives the set of benchmarks from the keys in + // Stats by group in an order meant to match the order + // benchmarks were read in. + Benchmarks map[string][]string + + // Metrics holds the accumulated metrics for each key. + Metrics map[Key]*Metrics + + // DeltaTest is the test to use to decide if a change is significant. + // If nil, it defaults to UTest. + DeltaTest DeltaTest + + // Alpha is the p-value cutoff to report a change as significant. + // If zero, it defaults to 0.05. + Alpha float64 + + // AddGeoMean specifies whether to add a line to the table + // showing the geometric mean of all the benchmark results. + AddGeoMean bool + + // SplitBy specifies the labels to split results by. + // By default, results will only be split by full name. + SplitBy []string + + // Order specifies the row display order for this table. + // If Order is nil, the table rows are printed in order of + // first appearance in the input. + Order Order +} + +// A Key identifies one metric (e.g., "ns/op", "B/op") from one +// benchmark (function name sans "Benchmark" prefix) and optional +// group in one configuration (input file name). +type Key struct { + Config, Group, Benchmark, Unit string +} + +// A Metrics holds the measurements of a single metric +// (for example, ns/op or MB/s) +// for all runs of a particular benchmark. +type Metrics struct { + Unit string // unit being measured + Values []float64 // measured values + RValues []float64 // Values with outliers removed + Min float64 // min of RValues + Mean float64 // mean of RValues + Max float64 // max of RValues +} + +// FormatMean formats m.Mean using scaler. +func (m *Metrics) FormatMean(scaler Scaler) string { + var s string + if scaler != nil { + s = scaler(m.Mean) + } else { + s = fmt.Sprint(m.Mean) + } + return s +} + +// FormatDiff computes and formats the percent variation of max and min compared to mean. +// If b.Mean or b.Max is zero, FormatDiff returns an empty string. 
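+// For example, Min = 90, Mean = 100, Max = 120 formats as "±20%".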
+func (m *Metrics) FormatDiff() string { + if m.Mean == 0 || m.Max == 0 { + return "" + } + diff := 1 - m.Min/m.Mean + if d := m.Max/m.Mean - 1; d > diff { + diff = d + } + + return fmt.Sprintf("±%.0f%%", diff*100.0) +} + +// Format returns a textual formatting of "Mean ±Diff" using scaler. +func (m *Metrics) Format(scaler Scaler) string { + if m.Unit == "" { + return "" + } + mean := m.FormatMean(scaler) + diff := m.FormatDiff() + if diff == "" { + return mean + " " + } + return fmt.Sprintf("%s %3s", mean, diff) +} + +// computeStats updates the derived statistics in m from the raw +// samples in m.Values. +func (m *Metrics) computeStats() { + // Discard outliers. + values := Sample{Xs: m.Values} + q1, q3 := values.Percentile(0.25), values.Percentile(0.75) + lo, hi := q1-1.5*(q3-q1), q3+1.5*(q3-q1) + for _, value := range m.Values { + if lo <= value && value <= hi { + m.RValues = append(m.RValues, value) + } + } + + // Compute statistics of remaining data. + m.Min, m.Max = Bounds(m.RValues) + m.Mean = Mean(m.RValues) +} + +// addMetrics returns the metrics with the given key from c, +// creating a new one if needed. +func (c *Collection) addMetrics(key Key) *Metrics { + if c.Metrics == nil { + c.Metrics = make(map[Key]*Metrics) + } + if stat, ok := c.Metrics[key]; ok { + return stat + } + + addString := func(strings *[]string, add string) { + for _, s := range *strings { + if s == add { + return + } + } + *strings = append(*strings, add) + } + addString(&c.Configs, key.Config) + addString(&c.Groups, key.Group) + if c.Benchmarks == nil { + c.Benchmarks = make(map[string][]string) + } + benchmarks := c.Benchmarks[key.Group] + addString(&benchmarks, key.Benchmark) + c.Benchmarks[key.Group] = benchmarks + addString(&c.Units, key.Unit) + m := &Metrics{Unit: key.Unit} + c.Metrics[key] = m + return m +} + +// // AddFile adds the benchmark results in the formatted data +// // (read from the reader r) to the named configuration. +// func (c *Collection) AddFile(config string, f io.Reader) error { +// c.Configs = append(c.Configs, config) +// key := Key{Config: config} +// br := benchfmt.NewReader(f) +// for br.Next() { +// c.addResult(key, br.Result()) +// } +// return br.Err() +// } + +// // AddData adds the benchmark results in the formatted data +// // (read from the reader r) to the named configuration. +// func (c *Collection) AddData(config string, data []byte) error { +// return c.AddFile(config, bytes.NewReader(data)) +// } + +// AddResults adds the benchmark results to the named configuration. 
+// func (c *Collection) AddResults(config string, results []*benchfmt.Result) { +// c.Configs = append(c.Configs, config) +// key := Key{Config: config} +// for _, r := range results { +// c.addResult(key, r) +// } +// } + +// func (c *Collection) addResult(key Key, r *benchfmt.Result) { +// f := strings.Fields(r.Content) +// if len(f) < 4 { +// return +// } +// name := f[0] +// if !strings.HasPrefix(name, "Benchmark") { +// return +// } +// name = strings.TrimPrefix(name, "Benchmark") +// n, _ := strconv.Atoi(f[1]) +// if n == 0 { +// return +// } +// key.Group = c.makeGroup(r) +// key.Benchmark = name +// for i := 2; i+2 <= len(f); i += 2 { +// val, err := strconv.ParseFloat(f[i], 64) +// if err != nil { +// continue +// } +// key.Unit = f[i+1] +// m := c.addMetrics(key) +// m.Values = append(m.Values, val) +// } +// } + +// func (c *Collection) makeGroup(r *benchfmt.Result) string { +// var out string +// for _, s := range c.SplitBy { +// v := r.NameLabels[s] +// if v == "" { +// v = r.Labels[s] +// } +// if v != "" { +// if out != "" { +// out = out + " " +// } +// out += fmt.Sprintf("%s:%s", s, v) +// } +// } +// return out +// } diff --git a/pkg/obistats/delta.go b/pkg/obistats/delta.go new file mode 100644 index 0000000..9ee4c4d --- /dev/null +++ b/pkg/obistats/delta.go @@ -0,0 +1,54 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import "errors" + +// A DeltaTest compares the old and new metrics and returns the +// expected probability that they are drawn from the same distribution. +// +// If a probability cannot be computed, the DeltaTest returns an +// error explaining why. Common errors include ErrSamplesEqual +// (all samples are equal), ErrSampleSize (there aren't enough samples), +// and ErrZeroVariance (the sample has zero variance). +// +// As a special case, the missing test NoDeltaTest returns -1, nil. +type DeltaTest func(old, new *Metrics) (float64, error) + +// Errors returned by DeltaTest. +var ( + ErrSamplesEqual = errors.New("all equal") + ErrSampleSize = errors.New("too few samples") + ErrZeroVariance = errors.New("zero variance") + ErrMismatchedSamples = errors.New("samples have different lengths") +) + +// NoDeltaTest applies no delta test; it returns -1, nil. +func NoDeltaTest(old, new *Metrics) (pval float64, err error) { + return -1, nil +} + +// TTest is a DeltaTest using the two-sample Welch t-test. +func TTest(old, new *Metrics) (pval float64, err error) { + t, err := TwoSampleWelchTTest( + Sample{Xs: old.RValues}, + Sample{Xs: new.RValues}, + LocationDiffers, + ) + if err != nil { + return -1, err + } + return t.P, nil +} + +// UTest is a DeltaTest using the Mann-Whitney U test. +func UTest(old, new *Metrics) (pval float64, err error) { + u, err := MannWhitneyUTest(old.RValues, new.RValues, LocationDiffers) + if err != nil { + return -1, err + } + return u.P, nil +} diff --git a/pkg/obistats/mannwhitney.go b/pkg/obistats/mannwhitney.go new file mode 100644 index 0000000..a840892 --- /dev/null +++ b/pkg/obistats/mannwhitney.go @@ -0,0 +1,275 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import ( + "math" + "sort" +) + +// A LocationHypothesis specifies the alternative hypothesis of a +// location test such as a t-test or a Mann-Whitney U-test. The +// default (zero) value is to test against the alternative hypothesis +// that they differ. 
+type LocationHypothesis int + +//go:generate stringer -type LocationHypothesis + +const ( + // LocationLess specifies the alternative hypothesis that the + // location of the first sample is less than the second. This + // is a one-tailed test. + LocationLess LocationHypothesis = -1 + + // LocationDiffers specifies the alternative hypothesis that + // the locations of the two samples are not equal. This is a + // two-tailed test. + LocationDiffers LocationHypothesis = 0 + + // LocationGreater specifies the alternative hypothesis that + // the location of the first sample is greater than the + // second. This is a one-tailed test. + LocationGreater LocationHypothesis = 1 +) + +// A MannWhitneyUTestResult is the result of a Mann-Whitney U-test. +type MannWhitneyUTestResult struct { + // N1 and N2 are the sizes of the input samples. + N1, N2 int + + // U is the value of the Mann-Whitney U statistic for this + // test, generalized by counting ties as 0.5. + // + // Given the Cartesian product of the two samples, this is the + // number of pairs in which the value from the first sample is + // greater than the value of the second, plus 0.5 times the + // number of pairs where the values from the two samples are + // equal. Hence, U is always an integer multiple of 0.5 (it is + // a whole integer if there are no ties) in the range [0, N1*N2]. + // + // U statistics always come in pairs, depending on which + // sample is "first". The mirror U for the other sample can be + // calculated as N1*N2 - U. + // + // There are many equivalent statistics with slightly + // different definitions. The Wilcoxon (1945) W statistic + // (generalized for ties) is U + (N1(N1+1))/2. It is also + // common to use 2U to eliminate the half steps and Smid + // (1956) uses N1*N2 - 2U to additionally center the + // distribution. + U float64 + + // AltHypothesis specifies the alternative hypothesis tested + // by this test against the null hypothesis that there is no + // difference in the locations of the samples. + AltHypothesis LocationHypothesis + + // P is the p-value of the Mann-Whitney test for the given + // null hypothesis. + P float64 +} + +// MannWhitneyExactLimit gives the largest sample size for which the +// exact U distribution will be used for the Mann-Whitney U-test. +// +// Using the exact distribution is necessary for small sample sizes +// because the distribution is highly irregular. However, computing +// the distribution for large sample sizes is both computationally +// expensive and unnecessary because it quickly approaches a normal +// approximation. Computing the distribution for two 50 value samples +// takes a few milliseconds on a 2014 laptop. +var MannWhitneyExactLimit = 50 + +// MannWhitneyTiesExactLimit gives the largest sample size for which +// the exact U distribution will be used for the Mann-Whitney U-test +// in the presence of ties. +// +// Computing this distribution is more expensive than computing the +// distribution without ties, so this is set lower. Computing this +// distribution for two 25 value samples takes about ten milliseconds +// on a 2014 laptop. +var MannWhitneyTiesExactLimit = 25 + +// MannWhitneyUTest performs a Mann-Whitney U-test [1,2] of the null +// hypothesis that two samples come from the same population against +// the alternative hypothesis that one sample tends to have larger or +// smaller values than the other. 
+// +// This is similar to a t-test, but unlike the t-test, the +// Mann-Whitney U-test is non-parametric (it does not assume a normal +// distribution). It has very slightly lower efficiency than the +// t-test on normal distributions. +// +// Computing the exact U distribution is expensive for large sample +// sizes, so this uses a normal approximation for sample sizes larger +// than MannWhitneyExactLimit if there are no ties or +// MannWhitneyTiesExactLimit if there are ties. This normal +// approximation uses both the tie correction and the continuity +// correction. +// +// This can fail with ErrSampleSize if either sample is empty or +// ErrSamplesEqual if all sample values are equal. +// +// This is also known as a Mann-Whitney-Wilcoxon test and is +// equivalent to the Wilcoxon rank-sum test, though the Wilcoxon +// rank-sum test differs in nomenclature. +// +// [1] Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of +// Whether one of Two Random Variables is Stochastically Larger than +// the Other". Annals of Mathematical Statistics 18 (1): 50–60. +// +// [2] Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". +// Journal of the American Statistical Association 61 (315): 772-787. +func MannWhitneyUTest(x1, x2 []float64, alt LocationHypothesis) (*MannWhitneyUTestResult, error) { + n1, n2 := len(x1), len(x2) + if n1 == 0 || n2 == 0 { + return nil, ErrSampleSize + } + + // Compute the U statistic and tie vector T. + x1 = append([]float64(nil), x1...) + x2 = append([]float64(nil), x2...) + sort.Float64s(x1) + sort.Float64s(x2) + merged, labels := labeledMerge(x1, x2) + + R1 := 0.0 + T, hasTies := []int{}, false + for i := 0; i < len(merged); { + rank1, nx1, v1 := i+1, 0, merged[i] + // Consume samples that tie this sample (including itself). + for ; i < len(merged) && merged[i] == v1; i++ { + if labels[i] == 1 { + nx1++ + } + } + // Assign all tied samples the average rank of the + // samples, where merged[0] has rank 1. + if nx1 != 0 { + rank := float64(i+rank1) / 2 + R1 += rank * float64(nx1) + } + T = append(T, i-rank1+1) + if i > rank1 { + hasTies = true + } + } + U1 := R1 - float64(n1*(n1+1))/2 + + // Compute the smaller of U1 and U2 + U2 := float64(n1*n2) - U1 + Usmall := math.Min(U1, U2) + + var p float64 + if !hasTies && n1 <= MannWhitneyExactLimit && n2 <= MannWhitneyExactLimit || + hasTies && n1 <= MannWhitneyTiesExactLimit && n2 <= MannWhitneyTiesExactLimit { + // Use exact U distribution. U1 will be an integer. + if len(T) == 1 { + // All values are equal. Test is meaningless. + return nil, ErrSamplesEqual + } + + dist := UDist{N1: n1, N2: n2, T: T} + switch alt { + case LocationDiffers: + if U1 == U2 { + // The distribution is symmetric about + // Usmall. Since the distribution is + // discrete, the CDF is discontinuous + // and if simply double CDF(Usmall), + // we'll double count the + // (non-infinitesimal) probability + // mass at Usmall. What we want is + // just the integral of the whole CDF, + // which is 1. + p = 1 + } else { + p = dist.CDF(Usmall) * 2 + } + + case LocationLess: + p = dist.CDF(U1) + + case LocationGreater: + p = 1 - dist.CDF(U1-1) + } + } else { + // Use normal approximation (with tie and continuity + // correction). + t := tieCorrection(T) + N := float64(n1 + n2) + μU := float64(n1*n2) / 2 + σU := math.Sqrt(float64(n1*n2) * ((N + 1) - t/(N*(N-1))) / 12) + if σU == 0 { + return nil, ErrSamplesEqual + } + numer := U1 - μU + // Perform continuity correction. 
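+		// The half-unit shift accounts for the discreteness of U: the
+		// two-sided case shrinks |U1-μU| by half a step, and the
+		// one-sided cases move the cutoff so the tail probability
+		// includes all of the mass at U1.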
+ switch alt { + case LocationDiffers: + numer -= mathSign(numer) * 0.5 + case LocationLess: + numer += 0.5 + case LocationGreater: + numer -= 0.5 + } + z := numer / σU + switch alt { + case LocationDiffers: + p = 2 * math.Min(StdNormal.CDF(z), 1-StdNormal.CDF(z)) + case LocationLess: + p = StdNormal.CDF(z) + case LocationGreater: + p = 1 - StdNormal.CDF(z) + } + } + + return &MannWhitneyUTestResult{N1: n1, N2: n2, U: U1, + AltHypothesis: alt, P: p}, nil +} + +// labeledMerge merges sorted lists x1 and x2 into sorted list merged. +// labels[i] is 1 or 2 depending on whether merged[i] is a value from +// x1 or x2, respectively. +func labeledMerge(x1, x2 []float64) (merged []float64, labels []byte) { + merged = make([]float64, len(x1)+len(x2)) + labels = make([]byte, len(x1)+len(x2)) + + i, j, o := 0, 0, 0 + for i < len(x1) && j < len(x2) { + if x1[i] < x2[j] { + merged[o] = x1[i] + labels[o] = 1 + i++ + } else { + merged[o] = x2[j] + labels[o] = 2 + j++ + } + o++ + } + for ; i < len(x1); i++ { + merged[o] = x1[i] + labels[o] = 1 + o++ + } + for ; j < len(x2); j++ { + merged[o] = x2[j] + labels[o] = 2 + o++ + } + return +} + +// tieCorrection computes the tie correction factor Σ_j (t_j³ - t_j) +// where t_j is the number of ties in the j'th rank. +func tieCorrection(ties []int) float64 { + t := 0 + for _, tie := range ties { + t += tie*tie*tie - tie + } + return float64(t) +} diff --git a/pkg/obistats/mathx.go b/pkg/obistats/mathx.go new file mode 100644 index 0000000..581e2fb --- /dev/null +++ b/pkg/obistats/mathx.go @@ -0,0 +1,79 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import "math" + +var inf = math.Inf(1) +var nan = math.NaN() + +// mathSign returns the sign of x: -1 if x < 0, 0 if x == 0, 1 if x > 0. +// If x is NaN, it returns NaN. +func mathSign(x float64) float64 { + if x == 0 { + return 0 + } else if x < 0 { + return -1 + } else if x > 0 { + return 1 + } + return nan +} + +const smallFactLimit = 20 // 20! => 62 bits +var smallFact [smallFactLimit + 1]int64 + +func init() { + smallFact[0] = 1 + fact := int64(1) + for n := int64(1); n <= smallFactLimit; n++ { + fact *= n + smallFact[n] = fact + } +} + +// mathChoose returns the binomial coefficient of n and k. +func mathChoose(n, k int) float64 { + if k == 0 || k == n { + return 1 + } + if k < 0 || n < k { + return 0 + } + if n <= smallFactLimit { // Implies k <= smallFactLimit + // It's faster to do several integer multiplications + // than it is to do an extra integer division. + // Remarkably, this is also faster than pre-computing + // Pascal's triangle (presumably because this is very + // cache efficient). + numer := int64(1) + for n1 := int64(n - (k - 1)); n1 <= int64(n); n1++ { + numer *= n1 + } + denom := smallFact[k] + return float64(numer / denom) + } + + return math.Exp(lchoose(n, k)) +} + +// mathLchoose returns math.Log(mathChoose(n, k)). 
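+// For example, mathLchoose(5, 2) ≈ math.Log(10), since C(5, 2) = 10.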
+func mathLchoose(n, k int) float64 { + if k == 0 || k == n { + return 0 + } + if k < 0 || n < k { + return math.NaN() + } + return lchoose(n, k) +} + +func lchoose(n, k int) float64 { + a, _ := math.Lgamma(float64(n + 1)) + b, _ := math.Lgamma(float64(k + 1)) + c, _ := math.Lgamma(float64(n - k + 1)) + return a - b - c +} diff --git a/pkg/obistats/normaldist.go b/pkg/obistats/normaldist.go new file mode 100644 index 0000000..a19c115 --- /dev/null +++ b/pkg/obistats/normaldist.go @@ -0,0 +1,142 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import ( + "math" + "math/rand" +) + +// NormalDist is a normal (Gaussian) distribution with mean Mu and +// standard deviation Sigma. +type NormalDist struct { + Mu, Sigma float64 +} + +// StdNormal is the standard normal distribution (Mu = 0, Sigma = 1) +var StdNormal = NormalDist{0, 1} + +// 1/sqrt(2 * pi) +const invSqrt2Pi = 0.39894228040143267793994605993438186847585863116493465766592583 + +func (n NormalDist) PDF(x float64) float64 { + z := x - n.Mu + return math.Exp(-z*z/(2*n.Sigma*n.Sigma)) * invSqrt2Pi / n.Sigma +} + +func (n NormalDist) pdfEach(xs []float64) []float64 { + res := make([]float64, len(xs)) + if n.Mu == 0 && n.Sigma == 1 { + // Standard normal fast path + for i, x := range xs { + res[i] = math.Exp(-x*x/2) * invSqrt2Pi + } + } else { + a := -1 / (2 * n.Sigma * n.Sigma) + b := invSqrt2Pi / n.Sigma + for i, x := range xs { + z := x - n.Mu + res[i] = math.Exp(z*z*a) * b + } + } + return res +} + +func (n NormalDist) CDF(x float64) float64 { + return math.Erfc(-(x-n.Mu)/(n.Sigma*math.Sqrt2)) / 2 +} + +func (n NormalDist) cdfEach(xs []float64) []float64 { + res := make([]float64, len(xs)) + a := 1 / (n.Sigma * math.Sqrt2) + for i, x := range xs { + res[i] = math.Erfc(-(x-n.Mu)*a) / 2 + } + return res +} + +func (n NormalDist) InvCDF(p float64) (x float64) { + // This is based on Peter John Acklam's inverse normal CDF + // algorithm: http://home.online.no/~pjacklam/notes/invnorm/ + const ( + a1 = -3.969683028665376e+01 + a2 = 2.209460984245205e+02 + a3 = -2.759285104469687e+02 + a4 = 1.383577518672690e+02 + a5 = -3.066479806614716e+01 + a6 = 2.506628277459239e+00 + + b1 = -5.447609879822406e+01 + b2 = 1.615858368580409e+02 + b3 = -1.556989798598866e+02 + b4 = 6.680131188771972e+01 + b5 = -1.328068155288572e+01 + + c1 = -7.784894002430293e-03 + c2 = -3.223964580411365e-01 + c3 = -2.400758277161838e+00 + c4 = -2.549732539343734e+00 + c5 = 4.374664141464968e+00 + c6 = 2.938163982698783e+00 + + d1 = 7.784695709041462e-03 + d2 = 3.224671290700398e-01 + d3 = 2.445134137142996e+00 + d4 = 3.754408661907416e+00 + + plow = 0.02425 + phigh = 1 - plow + ) + + if p < 0 || p > 1 { + return nan + } else if p == 0 { + return -inf + } else if p == 1 { + return inf + } + + if p < plow { + // Rational approximation for lower region. + q := math.Sqrt(-2 * math.Log(p)) + x = (((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) / + ((((d1*q+d2)*q+d3)*q+d4)*q + 1) + } else if phigh < p { + // Rational approximation for upper region. + q := math.Sqrt(-2 * math.Log(1-p)) + x = -(((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) / + ((((d1*q+d2)*q+d3)*q+d4)*q + 1) + } else { + // Rational approximation for central region. + q := p - 0.5 + r := q * q + x = (((((a1*r+a2)*r+a3)*r+a4)*r+a5)*r + a6) * q / + (((((b1*r+b2)*r+b3)*r+b4)*r+b5)*r + 1) + } + + // Refine approximation. 
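+	// One step of Halley's method: e is the CDF error of the current x,
+	// u = e/φ(x) is the corresponding Newton step, and the update applies
+	// Halley's second-order correction to that step.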
+ e := 0.5*math.Erfc(-x/math.Sqrt2) - p + u := e * math.Sqrt(2*math.Pi) * math.Exp(x*x/2) + x = x - u/(1+x*u/2) + + // Adjust from standard normal. + return x*n.Sigma + n.Mu +} + +func (n NormalDist) Rand(r *rand.Rand) float64 { + var x float64 + if r == nil { + x = rand.NormFloat64() + } else { + x = r.NormFloat64() + } + return x*n.Sigma + n.Mu +} + +func (n NormalDist) Bounds() (float64, float64) { + const stddevs = 3 + return n.Mu - stddevs*n.Sigma, n.Mu + stddevs*n.Sigma +} diff --git a/pkg/obistats/sample.go b/pkg/obistats/sample.go new file mode 100644 index 0000000..3d23e70 --- /dev/null +++ b/pkg/obistats/sample.go @@ -0,0 +1,341 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import ( + "math" + "sort" +) + +// Sample is a collection of possibly weighted data points. +type Sample struct { + // Xs is the slice of sample values. + Xs []float64 + + // Weights[i] is the weight of sample Xs[i]. If Weights is + // nil, all Xs have weight 1. Weights must have the same + // length of Xs and all values must be non-negative. + Weights []float64 + + // Sorted indicates that Xs is sorted in ascending order. + Sorted bool +} + +// Bounds returns the minimum and maximum values of xs. +func Bounds(xs []float64) (min float64, max float64) { + if len(xs) == 0 { + return math.NaN(), math.NaN() + } + min, max = xs[0], xs[0] + for _, x := range xs { + if x < min { + min = x + } + if x > max { + max = x + } + } + return +} + +// Bounds returns the minimum and maximum values of the Sample. +// +// If the Sample is weighted, this ignores samples with zero weight. +// +// This is constant time if s.Sorted and there are no zero-weighted +// values. +func (s Sample) Bounds() (min float64, max float64) { + if len(s.Xs) == 0 || (!s.Sorted && s.Weights == nil) { + return Bounds(s.Xs) + } + + if s.Sorted { + if s.Weights == nil { + return s.Xs[0], s.Xs[len(s.Xs)-1] + } + min, max = math.NaN(), math.NaN() + for i, w := range s.Weights { + if w != 0 { + min = s.Xs[i] + break + } + } + if math.IsNaN(min) { + return + } + for i := range s.Weights { + if s.Weights[len(s.Weights)-i-1] != 0 { + max = s.Xs[len(s.Weights)-i-1] + break + } + } + } else { + min, max = math.Inf(1), math.Inf(-1) + for i, x := range s.Xs { + w := s.Weights[i] + if x < min && w != 0 { + min = x + } + if x > max && w != 0 { + max = x + } + } + if math.IsInf(min, 0) { + min, max = math.NaN(), math.NaN() + } + } + return +} + +// vecSum returns the sum of xs. +func vecSum(xs []float64) float64 { + sum := 0.0 + for _, x := range xs { + sum += x + } + return sum +} + +// Sum returns the (possibly weighted) sum of the Sample. +func (s Sample) Sum() float64 { + if s.Weights == nil { + return vecSum(s.Xs) + } + sum := 0.0 + for i, x := range s.Xs { + sum += x * s.Weights[i] + } + return sum +} + +// Weight returns the total weight of the Sasmple. +func (s Sample) Weight() float64 { + if s.Weights == nil { + return float64(len(s.Xs)) + } + return vecSum(s.Weights) +} + +// // Mean returns the arithmetic mean of xs. +// func Mean(xs []float64) float64 { +// if len(xs) == 0 { +// return math.NaN() +// } +// m := 0.0 +// for i, x := range xs { +// m += (x - m) / float64(i+1) +// } +// return m +// } + +// Mean returns the arithmetic mean of the Sample. 
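+// For a weighted Sample, e.g. Xs = {2, 6} with Weights = {1, 3}, the
+// result is the weighted mean (2*1 + 6*3) / 4 = 5.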
+func (s Sample) Mean() float64 { + if len(s.Xs) == 0 || s.Weights == nil { + return Mean(s.Xs) + } + + m, wsum := 0.0, 0.0 + for i, x := range s.Xs { + // Use weighted incremental mean: + // m_i = (1 - w_i/wsum_i) * m_(i-1) + (w_i/wsum_i) * x_i + // = m_(i-1) + (x_i - m_(i-1)) * (w_i/wsum_i) + w := s.Weights[i] + wsum += w + m += (x - m) * w / wsum + } + return m +} + +// GeoMean returns the geometric mean of xs. xs must be positive. +func GeoMean(xs []float64) float64 { + if len(xs) == 0 { + return math.NaN() + } + m := 0.0 + for i, x := range xs { + if x <= 0 { + return math.NaN() + } + lx := math.Log(x) + m += (lx - m) / float64(i+1) + } + return math.Exp(m) +} + +// GeoMean returns the geometric mean of the Sample. All samples +// values must be positive. +func (s Sample) GeoMean() float64 { + if len(s.Xs) == 0 || s.Weights == nil { + return GeoMean(s.Xs) + } + + m, wsum := 0.0, 0.0 + for i, x := range s.Xs { + w := s.Weights[i] + wsum += w + lx := math.Log(x) + m += (lx - m) * w / wsum + } + return math.Exp(m) +} + +// Variance returns the sample variance of xs. +func Variance(xs []float64) float64 { + if len(xs) == 0 { + return math.NaN() + } else if len(xs) <= 1 { + return 0 + } + + // Based on Wikipedia's presentation of Welford 1962 + // (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm). + // This is more numerically stable than the standard two-pass + // formula and not prone to massive cancellation. + mean, M2 := 0.0, 0.0 + for n, x := range xs { + delta := x - mean + mean += delta / float64(n+1) + M2 += delta * (x - mean) + } + return M2 / float64(len(xs)-1) +} + +// Variance returns the sample variance of xs. +func (s Sample) Variance() float64 { + if len(s.Xs) == 0 || s.Weights == nil { + return Variance(s.Xs) + } + // TODO(austin) + panic("Weighted Variance not implemented") +} + +// StdDev returns the sample standard deviation of xs. +func StdDev(xs []float64) float64 { + return math.Sqrt(Variance(xs)) +} + +// StdDev returns the sample standard deviation of the Sample. +func (s Sample) StdDev() float64 { + if len(s.Xs) == 0 || s.Weights == nil { + return StdDev(s.Xs) + } + // TODO(austin) + panic("Weighted StdDev not implemented") +} + +// Percentile returns the pctileth value from the Sample. This uses +// interpolation method R8 from Hyndman and Fan (1996). +// +// pctile will be capped to the range [0, 1]. If len(xs) == 0 or all +// weights are 0, returns NaN. +// +// Percentile(0.5) is the median. Percentile(0.25) and +// Percentile(0.75) are the first and third quartiles, respectively. +// +// This is constant time if s.Sorted and s.Weights == nil. +func (s Sample) Percentile(pctile float64) float64 { + if len(s.Xs) == 0 { + return math.NaN() + } else if pctile <= 0 { + min, _ := s.Bounds() + return min + } else if pctile >= 1 { + _, max := s.Bounds() + return max + } + + if !s.Sorted { + // TODO(austin) Use select algorithm instead + s = *s.Copy().Sort() + } + + if s.Weights == nil { + N := float64(len(s.Xs)) + //n := pctile * (N + 1) // R6 + n := 1/3.0 + pctile*(N+1/3.0) // R8 + kf, frac := math.Modf(n) + k := int(kf) + if k <= 0 { + return s.Xs[0] + } else if k >= len(s.Xs) { + return s.Xs[len(s.Xs)-1] + } + return s.Xs[k-1] + frac*(s.Xs[k]-s.Xs[k-1]) + } + // TODO(austin): Implement interpolation + + target := s.Weight() * pctile + + // TODO(austin) If we had cumulative weights, we could + // do this in log time. 
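+	// Linear scan: consume weights until the remaining target mass goes
+	// negative; that sample carries the requested percentile.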
+ for i, weight := range s.Weights { + target -= weight + if target < 0 { + return s.Xs[i] + } + } + return s.Xs[len(s.Xs)-1] +} + +// IQR returns the interquartile range of the Sample. +// +// This is constant time if s.Sorted and s.Weights == nil. +func (s Sample) IQR() float64 { + if !s.Sorted { + s = *s.Copy().Sort() + } + return s.Percentile(0.75) - s.Percentile(0.25) +} + +type sampleSorter struct { + xs []float64 + weights []float64 +} + +func (p *sampleSorter) Len() int { + return len(p.xs) +} + +func (p *sampleSorter) Less(i, j int) bool { + return p.xs[i] < p.xs[j] +} + +func (p *sampleSorter) Swap(i, j int) { + p.xs[i], p.xs[j] = p.xs[j], p.xs[i] + p.weights[i], p.weights[j] = p.weights[j], p.weights[i] +} + +// Sort sorts the samples in place in s and returns s. +// +// A sorted sample improves the performance of some algorithms. +func (s *Sample) Sort() *Sample { + if s.Sorted || sort.Float64sAreSorted(s.Xs) { + // All set + } else if s.Weights == nil { + sort.Float64s(s.Xs) + } else { + sort.Sort(&sampleSorter{s.Xs, s.Weights}) + } + s.Sorted = true + return s +} + +// Copy returns a copy of the Sample. +// +// The returned Sample shares no data with the original, so they can +// be modified (for example, sorted) independently. +func (s Sample) Copy() *Sample { + xs := make([]float64, len(s.Xs)) + copy(xs, s.Xs) + + weights := []float64(nil) + if s.Weights != nil { + weights = make([]float64, len(s.Weights)) + copy(weights, s.Weights) + } + + return &Sample{xs, weights, s.Sorted} +} diff --git a/pkg/obistats/scaler.go b/pkg/obistats/scaler.go new file mode 100644 index 0000000..72b8f9e --- /dev/null +++ b/pkg/obistats/scaler.go @@ -0,0 +1,119 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import ( + "fmt" + "strings" +) + +// A Scaler is a function that scales and formats a measurement. +// All measurements within a given table row are formatted +// using the same scaler, so that the units are consistent +// across the row. +type Scaler func(float64) string + +// NewScaler returns a Scaler appropriate for formatting +// the measurement val, which has the given unit. 
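+// For example, NewScaler(1234567, "ns/op") returns a scaler that formats
+// 1234567 as "1.23ms".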
+func NewScaler(val float64, unit string) Scaler { + if hasBaseUnit(unit, "ns/op") || hasBaseUnit(unit, "ns/GC") { + return timeScaler(val) + } + + var format string + var scale float64 + var suffix string + + prescale := 1.0 + if hasBaseUnit(unit, "MB/s") { + prescale = 1e6 + } + + switch x := val * prescale; { + case x >= 99500000000000: + format, scale, suffix = "%.0f", 1e12, "T" + case x >= 9950000000000: + format, scale, suffix = "%.1f", 1e12, "T" + case x >= 995000000000: + format, scale, suffix = "%.2f", 1e12, "T" + case x >= 99500000000: + format, scale, suffix = "%.0f", 1e9, "G" + case x >= 9950000000: + format, scale, suffix = "%.1f", 1e9, "G" + case x >= 995000000: + format, scale, suffix = "%.2f", 1e9, "G" + case x >= 99500000: + format, scale, suffix = "%.0f", 1e6, "M" + case x >= 9950000: + format, scale, suffix = "%.1f", 1e6, "M" + case x >= 995000: + format, scale, suffix = "%.2f", 1e6, "M" + case x >= 99500: + format, scale, suffix = "%.0f", 1e3, "k" + case x >= 9950: + format, scale, suffix = "%.1f", 1e3, "k" + case x >= 995: + format, scale, suffix = "%.2f", 1e3, "k" + case x >= 99.5: + format, scale, suffix = "%.0f", 1, "" + case x >= 9.95: + format, scale, suffix = "%.1f", 1, "" + default: + format, scale, suffix = "%.2f", 1, "" + } + + if hasBaseUnit(unit, "B/op") || hasBaseUnit(unit, "bytes/op") || hasBaseUnit(unit, "bytes") { + suffix += "B" + } + if hasBaseUnit(unit, "MB/s") { + suffix += "B/s" + } + scale /= prescale + + return func(val float64) string { + return fmt.Sprintf(format+suffix, val/scale) + } +} + +func timeScaler(ns float64) Scaler { + var format string + var scale float64 + switch x := ns / 1e9; { + case x >= 99.5: + format, scale = "%.0fs", 1 + case x >= 9.95: + format, scale = "%.1fs", 1 + case x >= 0.995: + format, scale = "%.2fs", 1 + case x >= 0.0995: + format, scale = "%.0fms", 1000 + case x >= 0.00995: + format, scale = "%.1fms", 1000 + case x >= 0.000995: + format, scale = "%.2fms", 1000 + case x >= 0.0000995: + format, scale = "%.0fµs", 1000*1000 + case x >= 0.00000995: + format, scale = "%.1fµs", 1000*1000 + case x >= 0.000000995: + format, scale = "%.2fµs", 1000*1000 + case x >= 0.0000000995: + format, scale = "%.0fns", 1000*1000*1000 + case x >= 0.00000000995: + format, scale = "%.1fns", 1000*1000*1000 + default: + format, scale = "%.2fns", 1000*1000*1000 + } + return func(ns float64) string { + return fmt.Sprintf(format, ns/1e9*scale) + } +} + +// hasBaseUnit reports whether s has unit unit. +// For now, it reports whether s == unit or s ends in -unit. +func hasBaseUnit(s, unit string) bool { + return s == unit || strings.HasSuffix(s, "-"+unit) +} diff --git a/pkg/obistats/sort.go b/pkg/obistats/sort.go new file mode 100644 index 0000000..9b90884 --- /dev/null +++ b/pkg/obistats/sort.go @@ -0,0 +1,37 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import ( + "math" + "sort" +) + +// An Order defines a sort order for a table. +// It reports whether t.Rows[i] should appear before t.Rows[j]. +type Order func(t *Table, i, j int) bool + +// ByName sorts tables by the Benchmark name column +func ByName(t *Table, i, j int) bool { + return t.Rows[i].Benchmark < t.Rows[j].Benchmark +} + +// ByDelta sorts tables by the Delta column, +// reversing the order when larger is better (for "speed" results). 
+func ByDelta(t *Table, i, j int) bool { + return math.Abs(t.Rows[i].PctDelta)*float64(t.Rows[i].Change) < + math.Abs(t.Rows[j].PctDelta)*float64(t.Rows[j].Change) +} + +// Reverse returns the reverse of the given order. +func Reverse(order Order) Order { + return func(t *Table, i, j int) bool { return order(t, j, i) } +} + +// Sort sorts a Table t (in place) by the given order. +func Sort(t *Table, order Order) { + sort.SliceStable(t.Rows, func(i, j int) bool { return order(t, i, j) }) +} diff --git a/pkg/obistats/table.go b/pkg/obistats/table.go new file mode 100644 index 0000000..c0f064d --- /dev/null +++ b/pkg/obistats/table.go @@ -0,0 +1,213 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import ( + "fmt" + "strings" +) + +// A Table is a table for display in the benchstat output. +type Table struct { + Metric string + OldNewDelta bool // is this an old-new-delta table? + Configs []string + Groups []string + Rows []*Row +} + +// A Row is a table row for display in the benchstat output. +type Row struct { + Benchmark string // benchmark name + Group string // group name + Scaler Scaler // formatter for stats means + Metrics []*Metrics // columns of statistics + PctDelta float64 // unformatted percent change + Delta string // formatted percent change + Note string // additional information + Change int // +1 better, -1 worse, 0 unchanged +} + +// Tables returns tables comparing the benchmarks in the collection. +func (c *Collection) Tables() []*Table { + deltaTest := c.DeltaTest + if deltaTest == nil { + deltaTest = UTest + } + alpha := c.Alpha + if alpha == 0 { + alpha = 0.05 + } + + // Update statistics. + for _, m := range c.Metrics { + m.computeStats() + } + + var tables []*Table + key := Key{} + for _, key.Unit = range c.Units { + table := new(Table) + table.Configs = c.Configs + table.Groups = c.Groups + table.Metric = metricOf(key.Unit) + table.OldNewDelta = len(c.Configs) == 2 + for _, key.Group = range c.Groups { + for _, key.Benchmark = range c.Benchmarks[key.Group] { + row := &Row{Benchmark: key.Benchmark} + if len(c.Groups) > 1 { + // Show group headers if there is more than one group. + row.Group = key.Group + } + + for _, key.Config = range c.Configs { + m := c.Metrics[key] + if m == nil { + row.Metrics = append(row.Metrics, new(Metrics)) + continue + } + row.Metrics = append(row.Metrics, m) + if row.Scaler == nil { + row.Scaler = NewScaler(m.Mean, m.Unit) + } + } + + // If there are only two configs being compared, add stats. + if table.OldNewDelta { + k0 := key + k0.Config = c.Configs[0] + k1 := key + k1.Config = c.Configs[1] + old := c.Metrics[k0] + new := c.Metrics[k1] + // If one is missing, omit row entirely. + // TODO: Control this better. 
+ if old == nil || new == nil { + continue + } + pval, testerr := deltaTest(old, new) + row.PctDelta = 0.00 + row.Delta = "~" + if testerr == ErrZeroVariance { + row.Note = "(zero variance)" + } else if testerr == ErrSampleSize { + row.Note = "(too few samples)" + } else if testerr == ErrSamplesEqual { + row.Note = "(all equal)" + } else if testerr != nil { + row.Note = fmt.Sprintf("(%s)", testerr) + } else if pval < alpha { + if new.Mean == old.Mean { + row.Delta = "0.00%" + } else { + pct := ((new.Mean / old.Mean) - 1.0) * 100.0 + row.PctDelta = pct + row.Delta = fmt.Sprintf("%+.2f%%", pct) + if pct < 0 == (table.Metric != "speed") { // smaller is better, except speeds + row.Change = +1 + } else { + row.Change = -1 + } + } + } + if row.Note == "" && pval != -1 { + row.Note = fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues)) + } + } + + table.Rows = append(table.Rows, row) + } + } + + if len(table.Rows) > 0 { + if c.Order != nil { + Sort(table, c.Order) + } + if c.AddGeoMean { + addGeomean(c, table, key.Unit, table.OldNewDelta) + } + tables = append(tables, table) + } + } + + return tables +} + +var metricSuffix = map[string]string{ + "ns/op": "time/op", + "ns/GC": "time/GC", + "B/op": "alloc/op", + "MB/s": "speed", +} + +// metricOf returns the name of the metric with the given unit. +func metricOf(unit string) string { + if s := metricSuffix[unit]; s != "" { + return s + } + for s, suff := range metricSuffix { + if dashs := "-" + s; strings.HasSuffix(unit, dashs) { + prefix := strings.TrimSuffix(unit, dashs) + return prefix + "-" + suff + } + } + return unit +} + +// addGeomean adds a "geomean" row to the table, +// showing the geometric mean of all the benchmarks. +func addGeomean(c *Collection, t *Table, unit string, delta bool) { + row := &Row{Benchmark: "[Geo mean]"} + key := Key{Unit: unit} + geomeans := []float64{} + maxCount := 0 + for _, key.Config = range c.Configs { + var means []float64 + for _, key.Group = range c.Groups { + for _, key.Benchmark = range c.Benchmarks[key.Group] { + m := c.Metrics[key] + // Omit 0 values from the geomean calculation, + // as these either make the geomean undefined + // or zero (depending on who you ask). This + // typically comes up with things like + // allocation counts, where it's fine to just + // ignore the benchmark. + if m != nil && m.Mean != 0 { + means = append(means, m.Mean) + } + } + } + if len(means) > maxCount { + maxCount = len(means) + } + if len(means) == 0 { + row.Metrics = append(row.Metrics, new(Metrics)) + delta = false + } else { + geomean := GeoMean(means) + geomeans = append(geomeans, geomean) + if row.Scaler == nil { + row.Scaler = NewScaler(geomean, unit) + } + row.Metrics = append(row.Metrics, &Metrics{ + Unit: unit, + Mean: geomean, + }) + } + } + if maxCount <= 1 { + // Only one benchmark contributed to this geomean. + // Since the geomean is the same as the benchmark + // result, don't bother outputting it. + return + } + if delta { + pct := ((geomeans[1] / geomeans[0]) - 1.0) * 100.0 + row.PctDelta = pct + row.Delta = fmt.Sprintf("%+.2f%%", pct) + } + t.Rows = append(t.Rows, row) +} diff --git a/pkg/obistats/tdist.go b/pkg/obistats/tdist.go new file mode 100644 index 0000000..0ec739c --- /dev/null +++ b/pkg/obistats/tdist.go @@ -0,0 +1,37 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import "math" + +// A TDist is a Student's t-distribution with V degrees of freedom. 
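+// As V grows large, the distribution approaches the standard normal
+// distribution.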
+type TDist struct { + V float64 +} + +// PDF is the probability distribution function +func (t TDist) PDF(x float64) float64 { + return math.Exp(lgamma((t.V+1)/2)-lgamma(t.V/2)) / + math.Sqrt(t.V*math.Pi) * math.Pow(1+(x*x)/t.V, -(t.V+1)/2) +} + +// CDF is the cumulative distribution function +func (t TDist) CDF(x float64) float64 { + if x == 0 { + return 0.5 + } else if x > 0 { + return 1 - 0.5*mathBetaInc(t.V/(t.V+x*x), t.V/2, 0.5) + } else if x < 0 { + return 1 - t.CDF(-x) + } else { + return math.NaN() + } +} + +// Bounds ... +func (t TDist) Bounds() (float64, float64) { + return -4, 4 +} diff --git a/pkg/obistats/ttest.go b/pkg/obistats/ttest.go new file mode 100644 index 0000000..7a04999 --- /dev/null +++ b/pkg/obistats/ttest.go @@ -0,0 +1,143 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import ( + "math" +) + +// A TTestResult is the result of a t-test. +type TTestResult struct { + // N1 and N2 are the sizes of the input samples. For a + // one-sample t-test, N2 is 0. + N1, N2 int + + // T is the value of the t-statistic for this t-test. + T float64 + + // DoF is the degrees of freedom for this t-test. + DoF float64 + + // AltHypothesis specifies the alternative hypothesis tested + // by this test against the null hypothesis that there is no + // difference in the means of the samples. + AltHypothesis LocationHypothesis + + // P is p-value for this t-test for the given null hypothesis. + P float64 +} + +func newTTestResult(n1, n2 int, t, dof float64, alt LocationHypothesis) *TTestResult { + dist := TDist{dof} + var p float64 + switch alt { + case LocationDiffers: + p = 2 * (1 - dist.CDF(math.Abs(t))) + case LocationLess: + p = dist.CDF(t) + case LocationGreater: + p = 1 - dist.CDF(t) + } + return &TTestResult{N1: n1, N2: n2, T: t, DoF: dof, AltHypothesis: alt, P: p} +} + +// A TTestSample is a sample that can be used for a one or two sample +// t-test. +type TTestSample interface { + Weight() float64 + Mean() float64 + Variance() float64 +} + +var () + +// TwoSampleTTest performs a two-sample (unpaired) Student's t-test on +// samples x1 and x2. This is a test of the null hypothesis that x1 +// and x2 are drawn from populations with equal means. It assumes x1 +// and x2 are independent samples, that the distributions have equal +// variance, and that the populations are normally distributed. +func TwoSampleTTest(x1, x2 TTestSample, alt LocationHypothesis) (*TTestResult, error) { + n1, n2 := x1.Weight(), x2.Weight() + if n1 == 0 || n2 == 0 { + return nil, ErrSampleSize + } + v1, v2 := x1.Variance(), x2.Variance() + if v1 == 0 && v2 == 0 { + return nil, ErrZeroVariance + } + + dof := n1 + n2 - 2 + v12 := ((n1-1)*v1 + (n2-1)*v2) / dof + t := (x1.Mean() - x2.Mean()) / math.Sqrt(v12*(1/n1+1/n2)) + return newTTestResult(int(n1), int(n2), t, dof, alt), nil +} + +// TwoSampleWelchTTest performs a two-sample (unpaired) Welch's t-test +// on samples x1 and x2. This is like TwoSampleTTest, but does not +// assume the distributions have equal variance. +func TwoSampleWelchTTest(x1, x2 TTestSample, alt LocationHypothesis) (*TTestResult, error) { + n1, n2 := x1.Weight(), x2.Weight() + if n1 <= 1 || n2 <= 1 { + // TODO: Can we still do this with n == 1? 
+ return nil, ErrSampleSize + } + v1, v2 := x1.Variance(), x2.Variance() + if v1 == 0 && v2 == 0 { + return nil, ErrZeroVariance + } + + dof := math.Pow(v1/n1+v2/n2, 2) / + (math.Pow(v1/n1, 2)/(n1-1) + math.Pow(v2/n2, 2)/(n2-1)) + s := math.Sqrt(v1/n1 + v2/n2) + t := (x1.Mean() - x2.Mean()) / s + return newTTestResult(int(n1), int(n2), t, dof, alt), nil +} + +// PairedTTest performs a two-sample paired t-test on samples x1 and +// x2. If μ0 is non-zero, this tests if the average of the difference +// is significantly different from μ0. If x1 and x2 are identical, +// this returns nil. +func PairedTTest(x1, x2 []float64, μ0 float64, alt LocationHypothesis) (*TTestResult, error) { + if len(x1) != len(x2) { + return nil, ErrMismatchedSamples + } + if len(x1) <= 1 { + // TODO: Can we still do this with n == 1? + return nil, ErrSampleSize + } + + dof := float64(len(x1) - 1) + + diff := make([]float64, len(x1)) + for i := range x1 { + diff[i] = x1[i] - x2[i] + } + sd := StdDev(diff) + if sd == 0 { + // TODO: Can we still do the test? + return nil, ErrZeroVariance + } + t := (Mean(diff) - μ0) * math.Sqrt(float64(len(x1))) / sd + return newTTestResult(len(x1), len(x2), t, dof, alt), nil +} + +// OneSampleTTest performs a one-sample t-test on sample x. This tests +// the null hypothesis that the population mean is equal to μ0. This +// assumes the distribution of the population of sample means is +// normal. +func OneSampleTTest(x TTestSample, μ0 float64, alt LocationHypothesis) (*TTestResult, error) { + n, v := x.Weight(), x.Variance() + if n == 0 { + return nil, ErrSampleSize + } + if v == 0 { + // TODO: Can we still do the test? + return nil, ErrZeroVariance + } + dof := n - 1 + t := (x.Mean() - μ0) * math.Sqrt(n) / math.Sqrt(v) + return newTTestResult(int(n), 0, t, dof, alt), nil +} diff --git a/pkg/obistats/udist.go b/pkg/obistats/udist.go new file mode 100644 index 0000000..4a0bc08 --- /dev/null +++ b/pkg/obistats/udist.go @@ -0,0 +1,386 @@ +package obistats + +// +// Dupplicated code from internal module available at : +// https://github.com/golang-design/bench.git +// + +import "math" + +// A UDist is the discrete probability distribution of the +// Mann-Whitney U statistic for a pair of samples of sizes N1 and N2. +// +// The details of computing this distribution with no ties can be +// found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of +// Whether one of Two Random Variables is Stochastically Larger than +// the Other". Annals of Mathematical Statistics 18 (1): 50–60. +// Computing this distribution in the presence of ties is described in +// Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". +// Journal of the American Statistical Association 61 (315): 772-787 +// and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney +// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: +// 805-813 (the former paper contains details that are glossed over in +// the latter paper but has mathematical typesetting issues, so it's +// easiest to get the context from the former paper and the details +// from the latter). +type UDist struct { + N1, N2 int + + // T is the count of the number of ties at each rank in the + // input distributions. T may be nil, in which case it is + // assumed there are no ties (which is equivalent to an M+N + // slice of 1s). It must be the case that Sum(T) == M+N. + T []int +} + +// hasTies returns true if d has any tied samples. 
+// hasTies returns true if d has any tied samples.
+func (d UDist) hasTies() bool {
+	for _, t := range d.T {
+		if t > 1 {
+			return true
+		}
+	}
+	return false
+}
+
+// p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947
+// for values of U from 0 up to and including the U argument.
+//
+// This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite
+// fast for small values of N1 and N2. However, it does not handle ties.
+func (d UDist) p(U int) []float64 {
+	// This is a dynamic programming implementation of the
+	// recursive recurrence definition given by Mann and Whitney:
+	//
+	//   p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m)
+	//   p_{n,m}(U) = 0                                        if U < 0
+	//   p_{0,m}(U) = p_{n,0}(U) = 1 / nCr(m+n, n)             if U = 0
+	//                           = 0                           if U > 0
+	//
+	// (Note that there is a typo in the original paper. The first
+	// recursive application of p should be for U-m, not U-M.)
+	//
+	// Since p_{n,m} only depends on p_{n-1,m} and p_{n,m-1}, we only
+	// need to store one "plane" of the three dimensional space at
+	// a time.
+	//
+	// Furthermore, p_{n,m} = p_{m,n}, so we only construct values
+	// for n <= m and obtain the rest through symmetry.
+	//
+	// We organize the computed values of p as follows:
+	//
+	//       n →  N
+	//     m *
+	//     ↓ * *
+	//       * * *
+	//       * * * *
+	//       * * * *
+	//     M * * * *
+	//
+	// where each * is a slice indexed by U. The code below
+	// computes these left-to-right, top-to-bottom, so it only
+	// stores one row of this matrix at a time. Furthermore,
+	// computing an element in a given U slice only depends on the
+	// same and smaller values of U, so we can overwrite the U
+	// slice we're computing in place as long as we start with the
+	// largest value of U. Finally, even though the recurrence
+	// depends on (n,m) above the diagonal and we use symmetry to
+	// mirror those across the diagonal to (m,n), the mirrored
+	// indexes are always available in the current row, so this
+	// mirroring does not interfere with our ability to recycle
+	// state.
+
+	N, M := d.N1, d.N2
+	if N > M {
+		N, M = M, N
+	}
+
+	memo := make([][]float64, N+1)
+	for n := range memo {
+		memo[n] = make([]float64, U+1)
+	}
+
+	for m := 0; m <= M; m++ {
+		// Compute p_{0,m}. This is zero except for U=0.
+		memo[0][0] = 1
+
+		// Compute the remainder of this row.
+		nlim := N
+		if m < nlim {
+			nlim = m
+		}
+		for n := 1; n <= nlim; n++ {
+			lp := memo[n-1] // p_{n-1,m}
+			var rp []float64
+			if n <= m-1 {
+				rp = memo[n] // p_{n,m-1}
+			} else {
+				rp = memo[m-1] // p_{m-1,n} and m==n
+			}
+
+			// For a given n,m, U is at most n*m.
+			//
+			// TODO: Actually, it's at most ⌈n*m/2⌉, but
+			// then we need to use more complex symmetries
+			// in the inner loop below.
+			ulim := n * m
+			if U < ulim {
+				ulim = U
+			}
+
+			out := memo[n] // p_{n,m}
+			nplusm := float64(n + m)
+			for U1 := ulim; U1 >= 0; U1-- {
+				l := 0.0
+				if U1-m >= 0 {
+					l = float64(n) * lp[U1-m]
+				}
+				r := float64(m) * rp[U1]
+				out[U1] = (l + r) / nplusm
+			}
+		}
+	}
+	return memo[N]
+}
+
+type ukey struct {
+	n1   int // size of first sample
+	twoU int // 2*U statistic for this permutation
+}
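
The dynamic program in `p` can be cross-checked by brute force for small sample sizes: enumerate every placement of the first sample among the pooled ranks, count the rank inversions that define U, and compare the resulting histogram with `PMF`. A self-contained sketch, assuming only the exported `UDist` API:

```go
package main

import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
)

// choose enumerates all ways of marking k of the len(mask) positions,
// calling f with each choice encoded as a boolean mask.
func choose(mask []bool, start, k int, f func([]bool)) {
	if k == 0 {
		f(mask)
		return
	}
	for i := start; i <= len(mask)-k; i++ {
		mask[i] = true
		choose(mask, i+1, k-1, f)
		mask[i] = false
	}
}

func main() {
	n1, n2 := 2, 3
	counts := map[int]int{}
	total := 0

	choose(make([]bool, n1+n2), 0, n1, func(mask []bool) {
		// U counts the pairs (i in sample 1, j in sample 2)
		// with rank(i) > rank(j).
		u, below := 0, 0 // below = sample-2 ranks seen so far
		for _, inSample1 := range mask {
			if inSample1 {
				u += below
			} else {
				below++
			}
		}
		counts[u]++
		total++
	})

	d := obistats.UDist{N1: n1, N2: n2}
	for u := 0; u <= n1*n2; u++ {
		fmt.Printf("U=%d  enumerated=%.4f  UDist.PMF=%.4f\n",
			u, float64(counts[u])/float64(total), d.PMF(float64(u)))
	}
}
```
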
+// This computes the cumulative counts of the Mann-Whitney U
+// distribution in the presence of ties using the computation from
+// Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
+// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
+// 805-813, with much guidance from appendix L of Klotz, A
+// Computational Approach to Statistics.
+//
+// makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the
+// number of ranks (up to len(t)), n1 is the size of the first sample
+// (up to the n1 argument), and U is the U statistic (up to the
+// argument twoU/2). The value of an entry in the memo table is the
+// number of permutations of a sample of size n1 in a ranking with tie
+// vector t[:K] having a U statistic <= U.
+func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 {
+	// Another candidate for a fast implementation is van de Wiel,
+	// "The split-up algorithm: a fast symbolic method for
+	// computing p-values of distribution-free statistics". This
+	// is what's used by R's coin package. It's a comparatively
+	// recent publication, so it's presumably faster (or perhaps
+	// just more general) than previous techniques, but I can't
+	// get my hands on the paper.
+	//
+	// TODO: ~40% of this function's time is spent in mapassign on
+	// the assignment lines in the two loops and another ~20% in
+	// map access and iteration. Improving map behavior or
+	// replacing the maps altogether with some other constant-time
+	// structure could double performance.
+	//
+	// TODO: The worst case for this function is when there are
+	// few ties. Yet the best case overall is when there are *no*
+	// ties. Can we get the best of both worlds? Use the fast
+	// algorithm for the most part when there are few ties and mix
+	// in the general algorithm just where we need it? That's
+	// certainly possible for sub-problems where t[:k] has no
+	// ties, but that doesn't help if t[0] has a tie but nothing
+	// else does. Is it possible to rearrange the ranks without
+	// messing up our computation of the U statistic for
+	// sub-problems?
+
+	K := len(t)
+
+	// Compute a coefficients. The a slice is indexed by k (a[0]
+	// is unused).
+	a := make([]int, K+1)
+	a[1] = t[0]
+	for k := 2; k <= K; k++ {
+		a[k] = a[k-1] + t[k-2] + t[k-1]
+	}
+
+	// Create the memo table for the counts function, A. The A
+	// slice is indexed by k (A[0] is unused).
+	//
+	// In "The Mann Whitney Distribution Using Linked Lists", they
+	// use linked lists (*gasp*) for this, but within each K it's
+	// really just a memoization table, so it's faster to use a
+	// map. The outer structure is a slice indexed by k because we
+	// need to find all memo entries with certain values of k.
+	//
+	// TODO: The n1 and twoU values in the ukeys follow strict
+	// patterns. For each K value, the n1 values are every integer
+	// between two bounds. For each (K, n1) value, the twoU values
+	// are every integer multiple of a certain base between two
+	// bounds. It might be worth turning these into directly
+	// indexable slices.
+	A := make([]map[ukey]float64, K+1)
+	A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0}
+
+	// Compute memo table (k, n1, twoU) triples from high K values
+	// to low K values. This drives the recurrence relation
+	// downward to figure out all of the needed argument triples.
+	//
+	// TODO: Is it possible to generate this table bottom-up? If
+	// so, this could be a pure dynamic programming algorithm and
+	// we could discard the K dimension. We could at least store
+	// the inputs in a more compact representation that replaces
+	// the twoU dimension with an interval and a step size (as
+	// suggested by Cheung, Klotz, not that they make it at all
+	// clear *why* they're suggesting this).
+ tsum := sumint(t) // always ∑ t[0:k] + for k := K - 1; k >= 2; k-- { + tsum -= t[k] + A[k] = make(map[ukey]float64) + + // Construct A[k] from A[k+1]. + for A_kplus1 := range A[k+1] { + rkLow := maxint(0, A_kplus1.n1-tsum) + rkHigh := minint(A_kplus1.n1, t[k]) + for rk := rkLow; rk <= rkHigh; rk++ { + twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk) + n1_k := A_kplus1.n1 - rk + if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) { + key := ukey{n1: n1_k, twoU: twoU_k} + A[k][key] = 0 + } + } + } + } + + // Fill counts in memo table from low K values to high K + // values. This unwinds the recurrence relation. + + // Start with K==2 base case. + // + // TODO: Later computations depend on these, but these don't + // depend on anything (including each other), so if K==2, we + // can skip the memo table altogether. + if K < 2 { + panic("K < 2") + } + N_2 := t[0] + t[1] + for A_2i := range A[2] { + Asum := 0.0 + r2Low := maxint(0, A_2i.n1-t[0]) + r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2 + for r2 := r2Low; r2 <= r2High; r2++ { + Asum += mathChoose(t[0], A_2i.n1-r2) * + mathChoose(t[1], r2) + } + A[2][A_2i] = Asum + } + + // Derive counts for the rest of the memo table. + tsum = t[0] // always ∑ t[0:k-1] + for k := 3; k <= K; k++ { + tsum += t[k-2] + + // Compute A[k] counts from A[k-1] counts. + for A_ki := range A[k] { + Asum := 0.0 + rkLow := maxint(0, A_ki.n1-tsum) + rkHigh := minint(A_ki.n1, t[k-1]) + for rk := rkLow; rk <= rkHigh; rk++ { + twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk) + n1_kminus1 := A_ki.n1 - rk + x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}] + if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 { + x = mathChoose(tsum, n1_kminus1) + } + Asum += x * mathChoose(t[k-1], rk) + } + A[k][A_ki] = Asum + } + } + + return A +} + +func twoUmin(n1 int, t, a []int) int { + K := len(t) + twoU := -n1 * n1 + n1_k := n1 + for k := 1; k <= K; k++ { + twoU_k := minint(n1_k, t[k-1]) + twoU += twoU_k * a[k] + n1_k -= twoU_k + } + return twoU +} + +func twoUmax(n1 int, t, a []int) int { + K := len(t) + twoU := -n1 * n1 + n1_k := n1 + for k := K; k > 0; k-- { + twoU_k := minint(n1_k, t[k-1]) + twoU += twoU_k * a[k] + n1_k -= twoU_k + } + return twoU +} + +func (d UDist) PMF(U float64) float64 { + if U < 0 || U >= 0.5+float64(d.N1*d.N2) { + return 0 + } + + if d.hasTies() { + // makeUmemo computes the CDF directly. Take its + // difference to get the PMF. + p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}] + p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] + if !ok1 || !ok2 { + panic("makeUmemo did not return expected memoization table") + } + return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1) + } + + // There are no ties. Use the fast algorithm. U must be integral. + Ui := int(math.Floor(U)) + // TODO: Use symmetry to minimize U + return d.p(Ui)[Ui] +} + +func (d UDist) CDF(U float64) float64 { + if U < 0 { + return 0 + } else if U >= float64(d.N1*d.N2) { + return 1 + } + + if d.hasTies() { + // TODO: Minimize U? + p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] + if !ok { + panic("makeUmemo did not return expected memoization table") + } + return p / mathChoose(d.N1+d.N2, d.N1) + } + + // There are no ties. Use the fast algorithm. U must be integral. + Ui := int(math.Floor(U)) + // The distribution is symmetric around U = m * n / 2. Sum up + // whichever tail is smaller. 
+ flip := Ui >= (d.N1*d.N2+1)/2 + if flip { + Ui = d.N1*d.N2 - Ui - 1 + } + pdfs := d.p(Ui) + p := 0.0 + for _, pdf := range pdfs[:Ui+1] { + p += pdf + } + if flip { + p = 1 - p + } + return p +} + +func (d UDist) Step() float64 { + return 0.5 +} + +func (d UDist) Bounds() (float64, float64) { + // TODO: More precise bounds when there are ties. + return 0, float64(d.N1 * d.N2) +} diff --git a/pkg/obitools/obicleandb/obicleandb.go b/pkg/obitools/obicleandb/obicleandb.go index b3e6956..0f752d0 100644 --- a/pkg/obitools/obicleandb/obicleandb.go +++ b/pkg/obitools/obicleandb/obicleandb.go @@ -1,6 +1,8 @@ package obicleandb import ( + "math/rand" + log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" @@ -18,6 +20,114 @@ func SequenceTrust(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error return obiseq.BioSequenceSlice{sequence}, nil } +func MakeSequenceFamilyGenusWorker(references obiseq.BioSequenceSlice) obiseq.SeqWorker { + + genus := make(map[int]*obiseq.BioSequenceSlice) + family := make(map[int]*obiseq.BioSequenceSlice) + + for _, ref := range references { + g, ok := ref.GetIntAttribute("genus_taxid") + f, ok := ref.GetIntAttribute("family_taxid") + + gs, ok := genus[g] + if !ok { + gs = obiseq.NewBioSequenceSlice(0) + genus[g] = gs + } + + *gs = append(*gs, ref) + + fs, ok := family[f] + if !ok { + fs = obiseq.NewBioSequenceSlice(0) + family[f] = fs + } + + *fs = append(*fs, ref) + } + + f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + g, _ := sequence.GetIntAttribute("genus_taxid") + sequence.SetAttribute("obicleandb_level", "genus") + + gs := genus[g] + + indist := make([]float64, 0, gs.Len()) + for _, s := range *gs { + if s != sequence { + lca, lali := obialign.FastLCSScore(sequence, s, -1, nil) + indist = append(indist, float64(lali-lca)) + } + } + nindist := len(indist) + + pval := 0.0 + + f, _ := sequence.GetIntAttribute("family_taxid") + fs := family[f] + + if nindist < 5 { + sequence.SetAttribute("obicleandb_level", "family") + + for _, s := range *fs { + gf, _ := s.GetIntAttribute("genus_taxid") + if g != gf { + lca, lali := obialign.FastLCSScore(sequence, s, -1, nil) + indist = append(indist, float64(lali-lca)) + } + } + + nindist = len(indist) + } + + if nindist > 0 { + + next := nindist + if next <= 20 { + next = 20 + } + + outdist := make([]float64, 0, nindist) + p := rand.Perm(references.Len()) + i := 0 + for _, ir := range p { + s := references[ir] + ff, _ := s.GetIntAttribute("family_taxid") + + if ff != f { + lca, lali := obialign.FastLCSScore(sequence, s, -1, nil) + outdist = append(outdist, float64(lali-lca)) + i += 1 + if i >= next { + break + } + } + } + + res, err := obistats.MannWhitneyUTest(outdist, indist, obistats.LocationGreater) + + if err == nil { + pval = res.P + } + + level, _ := sequence.GetAttribute("obicleandb_level") + log.Warnf("%s - level: %v", sequence.Id(), level) + log.Warnf("%s - gdist: %v", sequence.Id(), indist) + log.Warnf("%s - fdist: %v", sequence.Id(), outdist) + log.Warnf("%s - pval: %f", sequence.Id(), pval) + } else { + sequence.SetAttribute("obicleandb_level", "none") + } + + sequence.SetAttribute("obicleandb_trusted", pval) + + return obiseq.BioSequenceSlice{sequence}, nil + } + + return f + +} + func diagCoord(x, y, n int) int { if x > y { x, y = y, x @@ -160,19 +270,26 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence { obioptions.CLIParallelWorkers(), ) - genera_iterator, err := obichunk.ISequenceChunk( - annotated, - 
obiseq.AnnotationClassifier("genus_taxid", "NA"),
-	)
+	references := annotated.Load()
 
-	if err != nil {
-		log.Fatal(err)
-	}
+	mannwhitney := MakeSequenceFamilyGenusWorker(references)
 
-	trusted := genera_iterator.MakeISliceWorker(
-		SequenceTrustSlice,
-		false,
-	)
+	partof := obiiter.IBatchOver(references,
+		obioptions.CLIBatchSize()).Speed("Testing belonging to genus")
 
-	return trusted
+	// genera_iterator, err := obichunk.ISequenceChunk(
+	//	annotated,
+	//	obiseq.AnnotationClassifier("genus_taxid", "NA"),
+	// )
+
+	// if err != nil {
+	//	log.Fatal(err)
+	// }
+
+	// trusted := genera_iterator.MakeISliceWorker(
+	//	SequenceTrustSlice,
+	//	false,
+	// )
+
+	return partof.MakeIWorker(mannwhitney, true)
 }
diff --git a/pkg/obitools/obidemerge/demerge.go b/pkg/obitools/obidemerge/demerge.go
new file mode 100644
index 0000000..ece68b2
--- /dev/null
+++ b/pkg/obitools/obidemerge/demerge.go
@@ -0,0 +1,44 @@
+package obidemerge
+
+import (
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+)
+
+// MakeDemergeWorker returns a SeqWorker demerging the merged_<key>
+// slot of a sequence: every entry of that slot yields a copy of the
+// sequence with <key> restored to its initial value and the count set
+// to the number of occurrences recorded in the slot.
+func MakeDemergeWorker(key string) obiseq.SeqWorker {
+	desc := obiseq.MakeStatsOnDescription(key)
+	f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
+
+		if sequence.HasStatsOn(key) {
+			stats := sequence.StatsOn(desc, "NA")
+			sequence.DeleteAttribute(obiseq.StatsOnSlotName(key))
+			slice := obiseq.NewBioSequenceSlice(len(stats))
+			i := 0
+
+			for k, v := range stats {
+				(*slice)[i] = sequence.Copy()
+				(*slice)[i].SetAttribute(key, k)
+				(*slice)[i].SetCount(v)
+				i++
+			}
+
+			return *slice, nil
+		}
+
+		return obiseq.BioSequenceSlice{sequence}, nil
+	}
+
+	return obiseq.SeqWorker(f)
+}
+
+func CLIDemergeSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
+
+	if CLIHasSlotToDemerge() {
+		worker := MakeDemergeWorker(CLIDemergeSlot())
+		return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0)
+	}
+
+	return iterator
+}
diff --git a/pkg/obitools/obidemerge/options.go b/pkg/obitools/obidemerge/options.go
new file mode 100644
index 0000000..af6e970
--- /dev/null
+++ b/pkg/obitools/obidemerge/options.go
@@ -0,0 +1,30 @@
+package obidemerge
+
+import (
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+var _Demerge = ""
+
+func DemergeOptionSet(options *getoptions.GetOpt) {
+
+	options.StringVar(&_Demerge, "demerge", _Demerge,
+		options.Alias("d"),
+		options.Description("Indicates which slot has to be demerged."))
+}
+
+// OptionSet adds to the basic option set every option declared for
+// the obidemerge command.
+func OptionSet(options *getoptions.GetOpt) {
+	obiconvert.OptionSet(options)
+	DemergeOptionSet(options)
+}
+
+func CLIDemergeSlot() string {
+	return _Demerge
+}
+
+func CLIHasSlotToDemerge() bool {
+	return _Demerge != ""
+}
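
The demerge worker can be exercised on its own. In the sketch below, the `obiseq.NewBioSequence` constructor, the exact shape of the `merged_sample` value, and the `Count` accessor are assumptions made for illustration:

```go
package main

import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obidemerge"
)

func main() {
	// Hypothetical sequence whose counts were merged per sample.
	seq := obiseq.NewBioSequence("seq_0001", []byte("acgtacgt"), "")
	seq.SetAttribute("merged_sample", map[string]interface{}{"A": 3, "B": 2})

	worker := obidemerge.MakeDemergeWorker("sample")
	demerged, err := worker(seq)
	if err != nil {
		panic(err)
	}

	// Expect two copies: one with sample=A and count=3, one with
	// sample=B and count=2; the merged_sample slot is removed.
	for _, s := range demerged {
		sample, _ := s.GetAttribute("sample")
		fmt.Println(s.Id(), sample, s.Count())
	}
}
```
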
diff --git a/pkg/obitools/obijoin/join.go b/pkg/obitools/obijoin/join.go
new file mode 100644
index 0000000..a8f5520
--- /dev/null
+++ b/pkg/obitools/obijoin/join.go
@@ -0,0 +1,132 @@
+package obijoin
+
+import (
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
+
+	log "github.com/sirupsen/logrus"
+)
+
+// An IndexedSequenceSlice is a sequence slice together with one index
+// per join key, mapping each observed key value to the set of
+// positions of the sequences exhibiting it.
+type IndexedSequenceSlice struct {
+	Sequences obiseq.BioSequenceSlice
+	Indices   []map[interface{}]*obiutils.Set[int]
+}
+
+func (s IndexedSequenceSlice) Len() int {
+	return len(s.Sequences)
+}
+
+// Get returns the sequences matching every provided key value. If one
+// of the values is absent from its index, no sequence can match and
+// an empty slice is returned.
+func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
+	var keeps obiutils.Set[int]
+
+	for i, v := range s.Indices {
+		idx, ok := v[keys[i]]
+		if !ok {
+			rep := obiseq.MakeBioSequenceSlice(0)
+			return &rep
+		}
+
+		if i == 0 {
+			keeps = *idx
+		} else {
+			keeps = keeps.Intersection(*idx)
+		}
+	}
+
+	rep := obiseq.MakeBioSequenceSlice(len(keeps))
+	for i, v := range keeps.Members() {
+		rep[i] = s.Sequences[v]
+	}
+
+	return &rep
+}
+
+// BuildIndexedSequenceSlice indexes seqs on each of the annotation
+// slots listed in keys.
+func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
+	indices := make([]map[interface{}]*obiutils.Set[int], len(keys))
+
+	for i, k := range keys {
+		idx := make(map[interface{}]*obiutils.Set[int])
+
+		for j, seq := range seqs {
+			if value, ok := seq.GetAttribute(k); ok {
+				goods, ok := idx[value]
+				if !ok {
+					goods = obiutils.NewSet[int]()
+					idx[value] = goods
+				}
+
+				goods.Add(j)
+			}
+		}
+
+		indices[i] = idx
+	}
+
+	return IndexedSequenceSlice{seqs, indices}
+}
+
+// MakeJoinWorker returns a SeqWorker copying onto every sequence the
+// annotations of the indexed sequences sharing its join-key values. A
+// sequence matching several entries is duplicated once per match; a
+// sequence without any match is passed through unchanged.
+func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSequence, updateQuality bool) obiseq.SeqWorker {
+	f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
+		var ok bool
+
+		keys := make([]interface{}, len(by))
+
+		for i, v := range by {
+			keys[i], ok = sequence.GetAttribute(v)
+			if !ok {
+				return obiseq.BioSequenceSlice{sequence}, nil
+			}
+		}
+
+		join_with := index.Get(keys...)
+
+		if join_with.Len() == 0 {
+			return obiseq.BioSequenceSlice{sequence}, nil
+		}
+
+		rep := obiseq.MakeBioSequenceSlice(join_with.Len())
+
+		for i, v := range *join_with {
+			rep[i] = sequence.Copy()
+			annot := rep[i].Annotations()
+			new_annot := v.Annotations()
+
+			for k, v := range new_annot {
+				annot[k] = v
+			}
+
+			if updateId {
+				rep[i].SetId(v.Id())
+			}
+			if updateSequence && len(v.Sequence()) > 0 {
+				rep[i].SetSequence(v.Sequence())
+			}
+			if updateQuality && len(v.Qualities()) > 0 {
+				rep[i].SetQualities(v.Qualities())
+			}
+		}
+
+		return rep, nil
+	}
+
+	return obiseq.SeqWorker(f)
+}
+
+func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
+
+	data_iter, err := obiformats.ReadSequencesFromFile(CLIJoinWith())
+
+	if err != nil {
+		log.Fatalf("Cannot read the data file to join with: %s %v", CLIJoinWith(), err)
+	}
+
+	data := data_iter.Load()
+
+	keys := CLIBy()
+
+	index := BuildIndexedSequenceSlice(data, keys.Right)
+
+	worker := MakeJoinWorker(keys.Left, index, CLIUpdateId(), CLIUpdateSequence(), CLIUpdateQuality())
+
+	iterator = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
+
+	return iterator
+}
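
The indexing machinery above can likewise be exercised outside the CLI. In this sketch, `BuildIndexedSequenceSlice` and `Get` come from join.go above, while the `obiseq.NewBioSequence` constructor is an assumption made for illustration:

```go
package main

import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obijoin"
)

func main() {
	// Hypothetical reference slice annotated with a taxon_id slot.
	refs := obiseq.MakeBioSequenceSlice(2)
	refs[0] = obiseq.NewBioSequence("ref_1", []byte("acgt"), "")
	refs[0].SetAttribute("taxon_id", 9606)
	refs[1] = obiseq.NewBioSequence("ref_2", []byte("tgca"), "")
	refs[1].SetAttribute("taxon_id", 10090)

	// Index the references on their taxon_id slot.
	index := obijoin.BuildIndexedSequenceSlice(refs, []string{"taxon_id"})

	// All references whose taxon_id equals 9606.
	hits := index.Get(9606)
	for _, s := range *hits {
		fmt.Println(s.Id())
	}
}
```
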
diff --git a/pkg/obitools/obijoin/options.go b/pkg/obitools/obijoin/options.go
new file mode 100644
index 0000000..aab9aff
--- /dev/null
+++ b/pkg/obitools/obijoin/options.go
@@ -0,0 +1,90 @@
+package obijoin
+
+import (
+	"strings"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+var _by = []string{}
+var _join = ""
+var _UpdateID = false
+var _UpdateSequence = false
+var _UpdateQuality = false
+
+// A By describes the join keys: Left lists the slots looked up in the
+// processed file, Right the corresponding slots in the file provided
+// through --join-with.
+type By struct {
+	Left  []string
+	Right []string
+}
+
+func JoinOptionSet(options *getoptions.GetOpt) {
+
+	options.StringSliceVar(&_by, "by", 1, 1,
+		options.Alias("b"),
+		options.Description("Declares a join key, as either <key> or <key>=<key_in_joined_file>."))
+
+	options.StringVar(&_join, "join-with", _join,
+		options.Alias("j"),
+		options.Description("File name of the file to join with."),
+		options.Required("You must provide a file name to join with."))
+
+	options.BoolVar(&_UpdateID, "update-id", _UpdateID,
+		options.Alias("i"),
+		options.Description("Update the sequence IDs in the joined file."))
+
+	options.BoolVar(&_UpdateSequence, "update-sequence", _UpdateSequence,
+		options.Alias("s"),
+		options.Description("Update the sequence in the joined file."))
+
+	options.BoolVar(&_UpdateQuality, "update-quality", _UpdateQuality,
+		options.Alias("q"),
+		options.Description("Update the quality in the joined file."))
+
+}
+
+// OptionSet adds to the basic option set every option declared for
+// the obijoin command.
+func OptionSet(options *getoptions.GetOpt) {
+	obiconvert.OptionSet(options)
+	JoinOptionSet(options)
+}
+
+// CLIBy returns the join keys declared with --by. Without any
+// declaration, sequences are joined on their id slot.
+func CLIBy() By {
+	if len(_by) == 0 {
+		return By{
+			Left:  []string{"id"},
+			Right: []string{"id"},
+		}
+	}
+
+	left := make([]string, len(_by))
+	right := make([]string, len(_by))
+
+	for i, v := range _by {
+		vals := strings.Split(v, "=")
+		left[i] = vals[0]
+		right[i] = vals[0]
+		if len(vals) > 1 {
+			right[i] = vals[1]
+		}
+	}
+
+	return By{Left: left, Right: right}
+}
+
+func CLIJoinWith() string {
+	return _join
+}
+
+func CLIUpdateId() bool {
+	return _UpdateID
+}
+
+func CLIUpdateSequence() bool {
+	return _UpdateSequence
+}
+
+func CLIUpdateQuality() bool {
+	return _UpdateQuality
+}
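
The `--by` parsing in `CLIBy` is the one subtlety of these options. The following self-contained sketch mirrors that rule with plain stdlib code; `parseBy` is a hypothetical stand-in for the unexported logic:

```go
package main

import (
	"fmt"
	"strings"
)

// parseBy mirrors the key-mapping rule implemented by CLIBy above:
// "taxid" joins the taxid slot on both sides, while "taxid=taxon_id"
// joins the taxid slot of the processed file with the taxon_id slot
// of the file given through --join-with.
func parseBy(by []string) (left, right []string) {
	left = make([]string, len(by))
	right = make([]string, len(by))
	for i, v := range by {
		vals := strings.Split(v, "=")
		left[i], right[i] = vals[0], vals[0]
		if len(vals) > 1 {
			right[i] = vals[1]
		}
	}
	return left, right
}

func main() {
	l, r := parseBy([]string{"sample", "taxid=taxon_id"})
	fmt.Println(l, r) // [sample taxid] [sample taxon_id]
}
```
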