first version of obidemerge, obijoin, and a new filter for obicleandb, still to be finished

Former-commit-id: 8a1ed26e5548c30db75644c294d478ec4d753f19
Eric Coissac
2024-07-10 15:21:42 +02:00
parent bd855c4965
commit c7ed47e110
24 changed files with 2712 additions and 19 deletions


@@ -20,6 +20,24 @@
### New features
- Adds a new `obijoin` utility to join information contained in a sequence
file with that contained in another sequence or CSV file. The command allows
you to specify the names of the keys in the main sequence file and in the
secondary data file that will be used to perform the join operation.
- Adds a new tool `obidemerge` to demerge a `merge_xxx` slot by recreating the
multiple identical sequences, each having its `xxx` slot restored to its initial
value and its count set to the number of occurrences recorded in the
`merge_xxx` slot. During the operation, the `merge_xxx` slot is removed.
- Adds CSV as an input format for every obitools command. To encode sequences,
the CSV file must include a column named `sequence` and another column named
`id`. An extra column named `qualities` can be added to specify the quality
scores of the sequence, following the same ASCII encoding as the FASTQ format.
All the other columns are considered annotations and are interpreted as JSON
values, potentially encoding atomic values. If a column value cannot be decoded
as JSON, it is considered a string (see the example after this list).
- A new option **--version** has been added to every obitools command. It will
print the version of the command.
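
A minimal sketch of such a CSV input (the `count` and `sample` columns are
hypothetical annotations: `4` decodes as a JSON number, while `soil_A` is not
valid JSON and is therefore kept as a string):

```csv
id,sequence,qualities,count,sample
seq_01,ccgtaagctat,IIIIIIIHHII,4,soil_A
seq_02,ttgcaagctaa,IIIHHIIIIII,1,soil_B
```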

go.mod (1 addition)

@@ -32,6 +32,7 @@ require (
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
github.com/kr/pretty v0.3.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/montanaflynn/stats v0.7.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.6.1 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect

go.sum (2 additions)

@@ -58,6 +58,8 @@ github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZ
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE=
github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=


@@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This means that the version
// corresponds to the last commit, not to the one that will be created when
// this file is committed
var _Commit = "2bc540d"
var _Commit = "a365bb6"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.


@@ -49,6 +49,8 @@ func MakeStatsOnDescription(descriptor string) StatsOnDescription {
}
}
var _merge_prefix = "merged_"
// StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute.
//
// Parameters:
@@ -57,7 +59,7 @@ func MakeStatsOnDescription(descriptor string) StatsOnDescription {
// Return type:
// - string
func StatsOnSlotName(key string) string {
return "merged_" + key
return _merge_prefix + key
}
// HasStatsOn tests if the sequence already has a slot summarizing statistics of occurrence for a given attribute.
@@ -115,12 +117,12 @@ func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOn
}
}
default:
stats = make(StatsOnValues, 100)
stats = make(StatsOnValues, 10)
annotations[mkey] = stats
newstat = true
}
} else {
stats = make(StatsOnValues, 100)
stats = make(StatsOnValues, 10)
annotations[mkey] = stats
newstat = true
}
@@ -165,7 +167,7 @@ func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSeq
log.Fatalf("Trying to make stats on a float value (%v : %T)", value, value)
}
default:
log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v : %T)", value, value)
log.Fatalf("Trying to make stats not on a string, integer or boolean value (%v : %T)", value, value)
}
retval = true
}
@@ -234,7 +236,7 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
if tomerge.HasAnnotation() {
ma := tomerge.Annotations()
for k, va := range annotations {
if !strings.HasPrefix(k, "merged_") {
if !strings.HasPrefix(k, _merge_prefix) {
vm, ok := ma[k]
if ok {
switch vm := vm.(type) {
@@ -255,7 +257,7 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
}
} else {
for k := range annotations {
if !strings.HasPrefix(k, "merged_") {
if !strings.HasPrefix(k, _merge_prefix) {
delete(annotations, k)
}
}

pkg/obistats/algo.go (new file, 113 lines)

@@ -0,0 +1,113 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
// Miscellaneous helper algorithms
import (
"fmt"
)
func maxint(a, b int) int {
if a > b {
return a
}
return b
}
func minint(a, b int) int {
if a < b {
return a
}
return b
}
func sumint(xs []int) int {
sum := 0
for _, x := range xs {
sum += x
}
return sum
}
// bisect returns an x in [low, high] such that |f(x)| <= tolerance
// using the bisection method.
//
// f(low) and f(high) must have opposite signs.
//
// If f does not have a root in this interval (e.g., it is
// discontiguous), this returns the X of the apparent discontinuity
// and false.
func bisect(f func(float64) float64, low, high, tolerance float64) (float64, bool) {
flow, fhigh := f(low), f(high)
if -tolerance <= flow && flow <= tolerance {
return low, true
}
if -tolerance <= fhigh && fhigh <= tolerance {
return high, true
}
if mathSign(flow) == mathSign(fhigh) {
panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%g f(%g)=%g", low, flow, high, fhigh))
}
for {
mid := (high + low) / 2
fmid := f(mid)
if -tolerance <= fmid && fmid <= tolerance {
return mid, true
}
if mid == high || mid == low {
return mid, false
}
if mathSign(fmid) == mathSign(flow) {
low = mid
flow = fmid
} else {
high = mid
fhigh = fmid
}
}
}
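// A minimal usage sketch (hypothetical helper): bisect locates the root of
// f(x) = x*x - 2 on [0, 2], i.e. the square root of two.
func exampleBisect() {
	root, ok := bisect(func(x float64) float64 { return x*x - 2 }, 0, 2, 1e-9)
	fmt.Println(root, ok) // ≈ 1.4142135…, true
}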
// bisectBool implements the bisection method on a boolean function.
// It returns x1, x2 ∈ [low, high], x1 < x2 such that f(x1) != f(x2)
// and x2 - x1 <= xtol.
//
// If f(low) == f(high), it panics.
func bisectBool(f func(float64) bool, low, high, xtol float64) (x1, x2 float64) {
flow, fhigh := f(low), f(high)
if flow == fhigh {
panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%v f(%g)=%v", low, flow, high, fhigh))
}
for {
if high-low <= xtol {
return low, high
}
mid := (high + low) / 2
if mid == high || mid == low {
return low, high
}
fmid := f(mid)
if fmid == flow {
low = mid
flow = fmid
} else {
high = mid
fhigh = fmid
}
}
}
// series returns the sum of the series f(0), f(1), ...
//
// This implementation is fast, but subject to round-off error.
func series(f func(float64) float64) float64 {
y, yp := 0.0, 1.0
for n := 0.0; y != yp; n++ {
yp = y
y += f(n)
}
return y
}
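// A minimal usage sketch (hypothetical helper, assuming "math" is added to
// the imports): the series Σ 1/n! over n = 0, 1, 2, … converges to e.
func exampleSeries() {
	e := series(func(n float64) float64 { return 1 / math.Gamma(n+1) })
	fmt.Println(e) // ≈ 2.718281828…
}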

pkg/obistats/beta.go (new file, 93 lines)

@@ -0,0 +1,93 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import "math"
func lgamma(x float64) float64 {
y, _ := math.Lgamma(x)
return y
}
// mathBeta returns the value of the complete beta function B(a, b).
func mathBeta(a, b float64) float64 {
// B(x,y) = Γ(x)Γ(y) / Γ(x+y)
return math.Exp(lgamma(a) + lgamma(b) - lgamma(a+b))
}
// mathBetaInc returns the value of the regularized incomplete beta
// function Iₓ(a, b).
//
// This is not to be confused with the "incomplete beta function",
// which can be computed as BetaInc(x, a, b)*Beta(a, b).
//
// If x < 0 or x > 1, returns NaN.
func mathBetaInc(x, a, b float64) float64 {
// Based on Numerical Recipes in C, section 6.4. This uses the
// continued fraction definition of I:
//
// (xᵃ*(1-x)ᵇ)/(a*B(a,b)) * (1/(1+(d₁/(1+(d₂/(1+...))))))
//
// where B(a,b) is the beta function and
//
// d_{2m+1} = -(a+m)(a+b+m)x/((a+2m)(a+2m+1))
// d_{2m} = m(b-m)x/((a+2m-1)(a+2m))
if x < 0 || x > 1 {
return math.NaN()
}
bt := 0.0
if 0 < x && x < 1 {
// Compute the coefficient before the continued
// fraction.
bt = math.Exp(lgamma(a+b) - lgamma(a) - lgamma(b) +
a*math.Log(x) + b*math.Log(1-x))
}
if x < (a+1)/(a+b+2) {
// Compute continued fraction directly.
return bt * betacf(x, a, b) / a
}
// Compute continued fraction after symmetry transform.
return 1 - bt*betacf(1-x, b, a)/b
}
// betacf is the continued fraction component of the regularized
// incomplete beta function Iₓ(a, b).
func betacf(x, a, b float64) float64 {
const maxIterations = 200
const epsilon = 3e-14
raiseZero := func(z float64) float64 {
if math.Abs(z) < math.SmallestNonzeroFloat64 {
return math.SmallestNonzeroFloat64
}
return z
}
c := 1.0
d := 1 / raiseZero(1-(a+b)*x/(a+1))
h := d
for m := 1; m <= maxIterations; m++ {
mf := float64(m)
// Even step of the recurrence.
numer := mf * (b - mf) * x / ((a + 2*mf - 1) * (a + 2*mf))
d = 1 / raiseZero(1+numer*d)
c = raiseZero(1 + numer/c)
h *= d * c
// Odd step of the recurrence.
numer = -(a + mf) * (a + b + mf) * x / ((a + 2*mf) * (a + 2*mf + 1))
d = 1 / raiseZero(1+numer*d)
c = raiseZero(1 + numer/c)
hfac := d * c
h *= hfac
if math.Abs(hfac-1) < epsilon {
return h
}
}
panic("betainc: a or b too big; failed to converge")
}
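// A minimal usage sketch (hypothetical helper, assuming "fmt" is added to
// the imports): Iₓ(a, b) is the CDF of a Beta(a, b) distribution, so by
// symmetry mathBetaInc(0.5, 2, 2) evaluates to 0.5.
func exampleBetaInc() {
	fmt.Println(mathBetaInc(0.5, 2, 2)) // 0.5
}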

pkg/obistats/data.go (new file, 225 lines)

@@ -0,0 +1,225 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import (
"fmt"
)
// A Collection is a collection of benchmark results.
type Collection struct {
// Configs, Groups, and Units give the set of configs,
// groups, and units from the keys in Stats in an order
// meant to match the order the benchmarks were read in.
Configs, Groups, Units []string
// Benchmarks gives the set of benchmarks from the keys in
// Stats by group in an order meant to match the order
// benchmarks were read in.
Benchmarks map[string][]string
// Metrics holds the accumulated metrics for each key.
Metrics map[Key]*Metrics
// DeltaTest is the test to use to decide if a change is significant.
// If nil, it defaults to UTest.
DeltaTest DeltaTest
// Alpha is the p-value cutoff to report a change as significant.
// If zero, it defaults to 0.05.
Alpha float64
// AddGeoMean specifies whether to add a line to the table
// showing the geometric mean of all the benchmark results.
AddGeoMean bool
// SplitBy specifies the labels to split results by.
// By default, results will only be split by full name.
SplitBy []string
// Order specifies the row display order for this table.
// If Order is nil, the table rows are printed in order of
// first appearance in the input.
Order Order
}
// A Key identifies one metric (e.g., "ns/op", "B/op") from one
// benchmark (function name sans "Benchmark" prefix) and optional
// group in one configuration (input file name).
type Key struct {
Config, Group, Benchmark, Unit string
}
// A Metrics holds the measurements of a single metric
// (for example, ns/op or MB/s)
// for all runs of a particular benchmark.
type Metrics struct {
Unit string // unit being measured
Values []float64 // measured values
RValues []float64 // Values with outliers removed
Min float64 // min of RValues
Mean float64 // mean of RValues
Max float64 // max of RValues
}
// FormatMean formats m.Mean using scaler.
func (m *Metrics) FormatMean(scaler Scaler) string {
var s string
if scaler != nil {
s = scaler(m.Mean)
} else {
s = fmt.Sprint(m.Mean)
}
return s
}
// FormatDiff computes and formats the percent variation of max and min compared to mean.
// If m.Mean or m.Max is zero, FormatDiff returns an empty string.
func (m *Metrics) FormatDiff() string {
if m.Mean == 0 || m.Max == 0 {
return ""
}
diff := 1 - m.Min/m.Mean
if d := m.Max/m.Mean - 1; d > diff {
diff = d
}
return fmt.Sprintf("±%.0f%%", diff*100.0)
}
// Format returns a textual formatting of "Mean ±Diff" using scaler.
func (m *Metrics) Format(scaler Scaler) string {
if m.Unit == "" {
return ""
}
mean := m.FormatMean(scaler)
diff := m.FormatDiff()
if diff == "" {
return mean + " "
}
return fmt.Sprintf("%s %3s", mean, diff)
}
// computeStats updates the derived statistics in m from the raw
// samples in m.Values.
func (m *Metrics) computeStats() {
// Discard outliers.
values := Sample{Xs: m.Values}
q1, q3 := values.Percentile(0.25), values.Percentile(0.75)
lo, hi := q1-1.5*(q3-q1), q3+1.5*(q3-q1)
for _, value := range m.Values {
if lo <= value && value <= hi {
m.RValues = append(m.RValues, value)
}
}
// Compute statistics of remaining data.
m.Min, m.Max = Bounds(m.RValues)
m.Mean = Mean(m.RValues)
}
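// A minimal usage sketch (hypothetical helper): computeStats drops values
// outside the 1.5×IQR fences before deriving Min, Mean and Max, so the
// outlier 100 is excluded from RValues.
func exampleComputeStats() {
	m := &Metrics{Unit: "ns/op", Values: []float64{10, 11, 12, 13, 100}}
	m.computeStats()
	fmt.Println(m.RValues, m.Mean) // [10 11 12 13] 11.5
}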
// addMetrics returns the metrics with the given key from c,
// creating a new one if needed.
func (c *Collection) addMetrics(key Key) *Metrics {
if c.Metrics == nil {
c.Metrics = make(map[Key]*Metrics)
}
if stat, ok := c.Metrics[key]; ok {
return stat
}
addString := func(strings *[]string, add string) {
for _, s := range *strings {
if s == add {
return
}
}
*strings = append(*strings, add)
}
addString(&c.Configs, key.Config)
addString(&c.Groups, key.Group)
if c.Benchmarks == nil {
c.Benchmarks = make(map[string][]string)
}
benchmarks := c.Benchmarks[key.Group]
addString(&benchmarks, key.Benchmark)
c.Benchmarks[key.Group] = benchmarks
addString(&c.Units, key.Unit)
m := &Metrics{Unit: key.Unit}
c.Metrics[key] = m
return m
}
// // AddFile adds the benchmark results in the formatted data
// // (read from the reader r) to the named configuration.
// func (c *Collection) AddFile(config string, f io.Reader) error {
// c.Configs = append(c.Configs, config)
// key := Key{Config: config}
// br := benchfmt.NewReader(f)
// for br.Next() {
// c.addResult(key, br.Result())
// }
// return br.Err()
// }
// // AddData adds the benchmark results in the formatted data
// // (read from the reader r) to the named configuration.
// func (c *Collection) AddData(config string, data []byte) error {
// return c.AddFile(config, bytes.NewReader(data))
// }
// AddResults adds the benchmark results to the named configuration.
// func (c *Collection) AddResults(config string, results []*benchfmt.Result) {
// c.Configs = append(c.Configs, config)
// key := Key{Config: config}
// for _, r := range results {
// c.addResult(key, r)
// }
// }
// func (c *Collection) addResult(key Key, r *benchfmt.Result) {
// f := strings.Fields(r.Content)
// if len(f) < 4 {
// return
// }
// name := f[0]
// if !strings.HasPrefix(name, "Benchmark") {
// return
// }
// name = strings.TrimPrefix(name, "Benchmark")
// n, _ := strconv.Atoi(f[1])
// if n == 0 {
// return
// }
// key.Group = c.makeGroup(r)
// key.Benchmark = name
// for i := 2; i+2 <= len(f); i += 2 {
// val, err := strconv.ParseFloat(f[i], 64)
// if err != nil {
// continue
// }
// key.Unit = f[i+1]
// m := c.addMetrics(key)
// m.Values = append(m.Values, val)
// }
// }
// func (c *Collection) makeGroup(r *benchfmt.Result) string {
// var out string
// for _, s := range c.SplitBy {
// v := r.NameLabels[s]
// if v == "" {
// v = r.Labels[s]
// }
// if v != "" {
// if out != "" {
// out = out + " "
// }
// out += fmt.Sprintf("%s:%s", s, v)
// }
// }
// return out
// }

pkg/obistats/delta.go (new file, 54 lines)

@@ -0,0 +1,54 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import "errors"
// A DeltaTest compares the old and new metrics and returns the
// expected probability that they are drawn from the same distribution.
//
// If a probability cannot be computed, the DeltaTest returns an
// error explaining why. Common errors include ErrSamplesEqual
// (all samples are equal), ErrSampleSize (there aren't enough samples),
// and ErrZeroVariance (the sample has zero variance).
//
// As a special case, the missing test NoDeltaTest returns -1, nil.
type DeltaTest func(old, new *Metrics) (float64, error)
// Errors returned by DeltaTest.
var (
ErrSamplesEqual = errors.New("all equal")
ErrSampleSize = errors.New("too few samples")
ErrZeroVariance = errors.New("zero variance")
ErrMismatchedSamples = errors.New("samples have different lengths")
)
// NoDeltaTest applies no delta test; it returns -1, nil.
func NoDeltaTest(old, new *Metrics) (pval float64, err error) {
return -1, nil
}
// TTest is a DeltaTest using the two-sample Welch t-test.
func TTest(old, new *Metrics) (pval float64, err error) {
t, err := TwoSampleWelchTTest(
Sample{Xs: old.RValues},
Sample{Xs: new.RValues},
LocationDiffers,
)
if err != nil {
return -1, err
}
return t.P, nil
}
// UTest is a DeltaTest using the Mann-Whitney U test.
func UTest(old, new *Metrics) (pval float64, err error) {
u, err := MannWhitneyUTest(old.RValues, new.RValues, LocationDiffers)
if err != nil {
return -1, err
}
return u.P, nil
}
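// A minimal usage sketch (hypothetical helper, assuming "fmt" is added to
// the imports): with complete separation of two samples of four values
// each, the exact two-sided p-value is 2/C(8,4) ≈ 0.0286.
func exampleUTest() {
	before := &Metrics{RValues: []float64{10, 11, 12, 13}}
	after := &Metrics{RValues: []float64{8, 8.5, 9, 9.5}}
	if p, err := UTest(before, after); err == nil {
		fmt.Println(p) // ≈ 0.0286
	}
}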

pkg/obistats/mannwhitney.go (new file, 275 lines)

@@ -0,0 +1,275 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import (
"math"
"sort"
)
// A LocationHypothesis specifies the alternative hypothesis of a
// location test such as a t-test or a Mann-Whitney U-test. The
// default (zero) value is to test against the alternative hypothesis
// that they differ.
type LocationHypothesis int
//go:generate stringer -type LocationHypothesis
const (
// LocationLess specifies the alternative hypothesis that the
// location of the first sample is less than the second. This
// is a one-tailed test.
LocationLess LocationHypothesis = -1
// LocationDiffers specifies the alternative hypothesis that
// the locations of the two samples are not equal. This is a
// two-tailed test.
LocationDiffers LocationHypothesis = 0
// LocationGreater specifies the alternative hypothesis that
// the location of the first sample is greater than the
// second. This is a one-tailed test.
LocationGreater LocationHypothesis = 1
)
// A MannWhitneyUTestResult is the result of a Mann-Whitney U-test.
type MannWhitneyUTestResult struct {
// N1 and N2 are the sizes of the input samples.
N1, N2 int
// U is the value of the Mann-Whitney U statistic for this
// test, generalized by counting ties as 0.5.
//
// Given the Cartesian product of the two samples, this is the
// number of pairs in which the value from the first sample is
// greater than the value of the second, plus 0.5 times the
// number of pairs where the values from the two samples are
// equal. Hence, U is always an integer multiple of 0.5 (it is
// a whole integer if there are no ties) in the range [0, N1*N2].
//
// U statistics always come in pairs, depending on which
// sample is "first". The mirror U for the other sample can be
// calculated as N1*N2 - U.
//
// There are many equivalent statistics with slightly
// different definitions. The Wilcoxon (1945) W statistic
// (generalized for ties) is U + (N1(N1+1))/2. It is also
// common to use 2U to eliminate the half steps and Smid
// (1956) uses N1*N2 - 2U to additionally center the
// distribution.
U float64
// AltHypothesis specifies the alternative hypothesis tested
// by this test against the null hypothesis that there is no
// difference in the locations of the samples.
AltHypothesis LocationHypothesis
// P is the p-value of the Mann-Whitney test for the given
// null hypothesis.
P float64
}
// MannWhitneyExactLimit gives the largest sample size for which the
// exact U distribution will be used for the Mann-Whitney U-test.
//
// Using the exact distribution is necessary for small sample sizes
// because the distribution is highly irregular. However, computing
// the distribution for large sample sizes is both computationally
// expensive and unnecessary because it quickly approaches a normal
// approximation. Computing the distribution for two 50 value samples
// takes a few milliseconds on a 2014 laptop.
var MannWhitneyExactLimit = 50
// MannWhitneyTiesExactLimit gives the largest sample size for which
// the exact U distribution will be used for the Mann-Whitney U-test
// in the presence of ties.
//
// Computing this distribution is more expensive than computing the
// distribution without ties, so this is set lower. Computing this
// distribution for two 25 value samples takes about ten milliseconds
// on a 2014 laptop.
var MannWhitneyTiesExactLimit = 25
// MannWhitneyUTest performs a Mann-Whitney U-test [1,2] of the null
// hypothesis that two samples come from the same population against
// the alternative hypothesis that one sample tends to have larger or
// smaller values than the other.
//
// This is similar to a t-test, but unlike the t-test, the
// Mann-Whitney U-test is non-parametric (it does not assume a normal
// distribution). It has very slightly lower efficiency than the
// t-test on normal distributions.
//
// Computing the exact U distribution is expensive for large sample
// sizes, so this uses a normal approximation for sample sizes larger
// than MannWhitneyExactLimit if there are no ties or
// MannWhitneyTiesExactLimit if there are ties. This normal
// approximation uses both the tie correction and the continuity
// correction.
//
// This can fail with ErrSampleSize if either sample is empty or
// ErrSamplesEqual if all sample values are equal.
//
// This is also known as a Mann-Whitney-Wilcoxon test and is
// equivalent to the Wilcoxon rank-sum test, though the Wilcoxon
// rank-sum test differs in nomenclature.
//
// [1] Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of
// Whether one of Two Random Variables is Stochastically Larger than
// the Other". Annals of Mathematical Statistics 18 (1): 50-60.
//
// [2] Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer".
// Journal of the American Statistical Association 61 (315): 772-787.
func MannWhitneyUTest(x1, x2 []float64, alt LocationHypothesis) (*MannWhitneyUTestResult, error) {
n1, n2 := len(x1), len(x2)
if n1 == 0 || n2 == 0 {
return nil, ErrSampleSize
}
// Compute the U statistic and tie vector T.
x1 = append([]float64(nil), x1...)
x2 = append([]float64(nil), x2...)
sort.Float64s(x1)
sort.Float64s(x2)
merged, labels := labeledMerge(x1, x2)
R1 := 0.0
T, hasTies := []int{}, false
for i := 0; i < len(merged); {
rank1, nx1, v1 := i+1, 0, merged[i]
// Consume samples that tie this sample (including itself).
for ; i < len(merged) && merged[i] == v1; i++ {
if labels[i] == 1 {
nx1++
}
}
// Assign all tied samples the average rank of the
// samples, where merged[0] has rank 1.
if nx1 != 0 {
rank := float64(i+rank1) / 2
R1 += rank * float64(nx1)
}
T = append(T, i-rank1+1)
if i > rank1 {
hasTies = true
}
}
U1 := R1 - float64(n1*(n1+1))/2
// Compute the smaller of U1 and U2
U2 := float64(n1*n2) - U1
Usmall := math.Min(U1, U2)
var p float64
if !hasTies && n1 <= MannWhitneyExactLimit && n2 <= MannWhitneyExactLimit ||
hasTies && n1 <= MannWhitneyTiesExactLimit && n2 <= MannWhitneyTiesExactLimit {
// Use exact U distribution. U1 will be an integer.
if len(T) == 1 {
// All values are equal. Test is meaningless.
return nil, ErrSamplesEqual
}
dist := UDist{N1: n1, N2: n2, T: T}
switch alt {
case LocationDiffers:
if U1 == U2 {
// The distribution is symmetric about
// Usmall. Since the distribution is
// discrete, the CDF is discontinuous
// and if simply double CDF(Usmall),
// we'll double count the
// (non-infinitesimal) probability
// mass at Usmall. What we want is
// just the integral of the whole CDF,
// which is 1.
p = 1
} else {
p = dist.CDF(Usmall) * 2
}
case LocationLess:
p = dist.CDF(U1)
case LocationGreater:
p = 1 - dist.CDF(U1-1)
}
} else {
// Use normal approximation (with tie and continuity
// correction).
t := tieCorrection(T)
N := float64(n1 + n2)
μU := float64(n1*n2) / 2
σU := math.Sqrt(float64(n1*n2) * ((N + 1) - t/(N*(N-1))) / 12)
if σU == 0 {
return nil, ErrSamplesEqual
}
numer := U1 - μU
// Perform continuity correction.
switch alt {
case LocationDiffers:
numer -= mathSign(numer) * 0.5
case LocationLess:
numer += 0.5
case LocationGreater:
numer -= 0.5
}
z := numer / σU
switch alt {
case LocationDiffers:
p = 2 * math.Min(StdNormal.CDF(z), 1-StdNormal.CDF(z))
case LocationLess:
p = StdNormal.CDF(z)
case LocationGreater:
p = 1 - StdNormal.CDF(z)
}
}
return &MannWhitneyUTestResult{N1: n1, N2: n2, U: U1,
AltHypothesis: alt, P: p}, nil
}
// labeledMerge merges sorted lists x1 and x2 into sorted list merged.
// labels[i] is 1 or 2 depending on whether merged[i] is a value from
// x1 or x2, respectively.
func labeledMerge(x1, x2 []float64) (merged []float64, labels []byte) {
merged = make([]float64, len(x1)+len(x2))
labels = make([]byte, len(x1)+len(x2))
i, j, o := 0, 0, 0
for i < len(x1) && j < len(x2) {
if x1[i] < x2[j] {
merged[o] = x1[i]
labels[o] = 1
i++
} else {
merged[o] = x2[j]
labels[o] = 2
j++
}
o++
}
for ; i < len(x1); i++ {
merged[o] = x1[i]
labels[o] = 1
o++
}
for ; j < len(x2); j++ {
merged[o] = x2[j]
labels[o] = 2
o++
}
return
}
// tieCorrection computes the tie correction factor Σ_j (t_j³ - t_j)
// where t_j is the number of ties in the j'th rank.
func tieCorrection(ties []int) float64 {
t := 0
for _, tie := range ties {
t += tie*tie*tie - tie
}
return float64(t)
}
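// A minimal usage sketch (hypothetical helper, assuming "fmt" is added to
// the imports): with complete separation of two samples of three values
// each, U is 0 and the exact two-sided p-value is 2/C(6,3) = 0.1.
func exampleMannWhitney() {
	r, err := MannWhitneyUTest(
		[]float64{1.1, 1.2, 1.3},
		[]float64{2.1, 2.2, 2.3},
		LocationDiffers)
	if err == nil {
		fmt.Println(r.U, r.P) // 0 0.1
	}
}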

pkg/obistats/mathx.go (new file, 79 lines)

@@ -0,0 +1,79 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import "math"
var inf = math.Inf(1)
var nan = math.NaN()
// mathSign returns the sign of x: -1 if x < 0, 0 if x == 0, 1 if x > 0.
// If x is NaN, it returns NaN.
func mathSign(x float64) float64 {
if x == 0 {
return 0
} else if x < 0 {
return -1
} else if x > 0 {
return 1
}
return nan
}
const smallFactLimit = 20 // 20! => 62 bits
var smallFact [smallFactLimit + 1]int64
func init() {
smallFact[0] = 1
fact := int64(1)
for n := int64(1); n <= smallFactLimit; n++ {
fact *= n
smallFact[n] = fact
}
}
// mathChoose returns the binomial coefficient of n and k.
func mathChoose(n, k int) float64 {
if k == 0 || k == n {
return 1
}
if k < 0 || n < k {
return 0
}
if n <= smallFactLimit { // Implies k <= smallFactLimit
// It's faster to do several integer multiplications
// than it is to do an extra integer division.
// Remarkably, this is also faster than pre-computing
// Pascal's triangle (presumably because this is very
// cache efficient).
numer := int64(1)
for n1 := int64(n - (k - 1)); n1 <= int64(n); n1++ {
numer *= n1
}
denom := smallFact[k]
return float64(numer / denom)
}
return math.Exp(lchoose(n, k))
}
// mathLchoose returns math.Log(mathChoose(n, k)).
func mathLchoose(n, k int) float64 {
if k == 0 || k == n {
return 0
}
if k < 0 || n < k {
return math.NaN()
}
return lchoose(n, k)
}
func lchoose(n, k int) float64 {
a, _ := math.Lgamma(float64(n + 1))
b, _ := math.Lgamma(float64(k + 1))
c, _ := math.Lgamma(float64(n - k + 1))
return a - b - c
}
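// A minimal usage sketch (hypothetical helper, assuming "fmt" is added to
// the imports): the binomial coefficient C(5, 2) and its logarithm.
func exampleChoose() {
	fmt.Println(mathChoose(5, 2))  // 10
	fmt.Println(mathLchoose(5, 2)) // ≈ 2.302585… (log 10)
}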

pkg/obistats/normaldist.go (new file, 142 lines)

@@ -0,0 +1,142 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import (
"math"
"math/rand"
)
// NormalDist is a normal (Gaussian) distribution with mean Mu and
// standard deviation Sigma.
type NormalDist struct {
Mu, Sigma float64
}
// StdNormal is the standard normal distribution (Mu = 0, Sigma = 1)
var StdNormal = NormalDist{0, 1}
// 1/sqrt(2 * pi)
const invSqrt2Pi = 0.39894228040143267793994605993438186847585863116493465766592583
func (n NormalDist) PDF(x float64) float64 {
z := x - n.Mu
return math.Exp(-z*z/(2*n.Sigma*n.Sigma)) * invSqrt2Pi / n.Sigma
}
func (n NormalDist) pdfEach(xs []float64) []float64 {
res := make([]float64, len(xs))
if n.Mu == 0 && n.Sigma == 1 {
// Standard normal fast path
for i, x := range xs {
res[i] = math.Exp(-x*x/2) * invSqrt2Pi
}
} else {
a := -1 / (2 * n.Sigma * n.Sigma)
b := invSqrt2Pi / n.Sigma
for i, x := range xs {
z := x - n.Mu
res[i] = math.Exp(z*z*a) * b
}
}
return res
}
func (n NormalDist) CDF(x float64) float64 {
return math.Erfc(-(x-n.Mu)/(n.Sigma*math.Sqrt2)) / 2
}
func (n NormalDist) cdfEach(xs []float64) []float64 {
res := make([]float64, len(xs))
a := 1 / (n.Sigma * math.Sqrt2)
for i, x := range xs {
res[i] = math.Erfc(-(x-n.Mu)*a) / 2
}
return res
}
func (n NormalDist) InvCDF(p float64) (x float64) {
// This is based on Peter John Acklam's inverse normal CDF
// algorithm: http://home.online.no/~pjacklam/notes/invnorm/
const (
a1 = -3.969683028665376e+01
a2 = 2.209460984245205e+02
a3 = -2.759285104469687e+02
a4 = 1.383577518672690e+02
a5 = -3.066479806614716e+01
a6 = 2.506628277459239e+00
b1 = -5.447609879822406e+01
b2 = 1.615858368580409e+02
b3 = -1.556989798598866e+02
b4 = 6.680131188771972e+01
b5 = -1.328068155288572e+01
c1 = -7.784894002430293e-03
c2 = -3.223964580411365e-01
c3 = -2.400758277161838e+00
c4 = -2.549732539343734e+00
c5 = 4.374664141464968e+00
c6 = 2.938163982698783e+00
d1 = 7.784695709041462e-03
d2 = 3.224671290700398e-01
d3 = 2.445134137142996e+00
d4 = 3.754408661907416e+00
plow = 0.02425
phigh = 1 - plow
)
if p < 0 || p > 1 {
return nan
} else if p == 0 {
return -inf
} else if p == 1 {
return inf
}
if p < plow {
// Rational approximation for lower region.
q := math.Sqrt(-2 * math.Log(p))
x = (((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) /
((((d1*q+d2)*q+d3)*q+d4)*q + 1)
} else if phigh < p {
// Rational approximation for upper region.
q := math.Sqrt(-2 * math.Log(1-p))
x = -(((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) /
((((d1*q+d2)*q+d3)*q+d4)*q + 1)
} else {
// Rational approximation for central region.
q := p - 0.5
r := q * q
x = (((((a1*r+a2)*r+a3)*r+a4)*r+a5)*r + a6) * q /
(((((b1*r+b2)*r+b3)*r+b4)*r+b5)*r + 1)
}
// Refine approximation.
e := 0.5*math.Erfc(-x/math.Sqrt2) - p
u := e * math.Sqrt(2*math.Pi) * math.Exp(x*x/2)
x = x - u/(1+x*u/2)
// Adjust from standard normal.
return x*n.Sigma + n.Mu
}
func (n NormalDist) Rand(r *rand.Rand) float64 {
var x float64
if r == nil {
x = rand.NormFloat64()
} else {
x = r.NormFloat64()
}
return x*n.Sigma + n.Mu
}
func (n NormalDist) Bounds() (float64, float64) {
const stddevs = 3
return n.Mu - stddevs*n.Sigma, n.Mu + stddevs*n.Sigma
}
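// A minimal usage sketch (hypothetical helper, assuming "fmt" is added to
// the imports): CDF and InvCDF are inverses, shown here around the usual
// 97.5% quantile of the standard normal distribution.
func exampleNormal() {
	fmt.Println(StdNormal.CDF(1.96))     // ≈ 0.9750
	fmt.Println(StdNormal.InvCDF(0.975)) // ≈ 1.9600
}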

pkg/obistats/sample.go (new file, 341 lines)

@@ -0,0 +1,341 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import (
"math"
"sort"
)
// Sample is a collection of possibly weighted data points.
type Sample struct {
// Xs is the slice of sample values.
Xs []float64
// Weights[i] is the weight of sample Xs[i]. If Weights is
// nil, all Xs have weight 1. Weights must have the same
// length as Xs and all values must be non-negative.
Weights []float64
// Sorted indicates that Xs is sorted in ascending order.
Sorted bool
}
// Bounds returns the minimum and maximum values of xs.
func Bounds(xs []float64) (min float64, max float64) {
if len(xs) == 0 {
return math.NaN(), math.NaN()
}
min, max = xs[0], xs[0]
for _, x := range xs {
if x < min {
min = x
}
if x > max {
max = x
}
}
return
}
// Bounds returns the minimum and maximum values of the Sample.
//
// If the Sample is weighted, this ignores samples with zero weight.
//
// This is constant time if s.Sorted and there are no zero-weighted
// values.
func (s Sample) Bounds() (min float64, max float64) {
if len(s.Xs) == 0 || (!s.Sorted && s.Weights == nil) {
return Bounds(s.Xs)
}
if s.Sorted {
if s.Weights == nil {
return s.Xs[0], s.Xs[len(s.Xs)-1]
}
min, max = math.NaN(), math.NaN()
for i, w := range s.Weights {
if w != 0 {
min = s.Xs[i]
break
}
}
if math.IsNaN(min) {
return
}
for i := range s.Weights {
if s.Weights[len(s.Weights)-i-1] != 0 {
max = s.Xs[len(s.Weights)-i-1]
break
}
}
} else {
min, max = math.Inf(1), math.Inf(-1)
for i, x := range s.Xs {
w := s.Weights[i]
if x < min && w != 0 {
min = x
}
if x > max && w != 0 {
max = x
}
}
if math.IsInf(min, 0) {
min, max = math.NaN(), math.NaN()
}
}
return
}
// vecSum returns the sum of xs.
func vecSum(xs []float64) float64 {
sum := 0.0
for _, x := range xs {
sum += x
}
return sum
}
// Sum returns the (possibly weighted) sum of the Sample.
func (s Sample) Sum() float64 {
if s.Weights == nil {
return vecSum(s.Xs)
}
sum := 0.0
for i, x := range s.Xs {
sum += x * s.Weights[i]
}
return sum
}
// Weight returns the total weight of the Sample.
func (s Sample) Weight() float64 {
if s.Weights == nil {
return float64(len(s.Xs))
}
return vecSum(s.Weights)
}
// // Mean returns the arithmetic mean of xs.
// func Mean(xs []float64) float64 {
// if len(xs) == 0 {
// return math.NaN()
// }
// m := 0.0
// for i, x := range xs {
// m += (x - m) / float64(i+1)
// }
// return m
// }
// Mean returns the arithmetic mean of the Sample.
func (s Sample) Mean() float64 {
if len(s.Xs) == 0 || s.Weights == nil {
return Mean(s.Xs)
}
m, wsum := 0.0, 0.0
for i, x := range s.Xs {
// Use weighted incremental mean:
// m_i = (1 - w_i/wsum_i) * m_(i-1) + (w_i/wsum_i) * x_i
// = m_(i-1) + (x_i - m_(i-1)) * (w_i/wsum_i)
w := s.Weights[i]
wsum += w
m += (x - m) * w / wsum
}
return m
}
// GeoMean returns the geometric mean of xs. xs must be positive.
func GeoMean(xs []float64) float64 {
if len(xs) == 0 {
return math.NaN()
}
m := 0.0
for i, x := range xs {
if x <= 0 {
return math.NaN()
}
lx := math.Log(x)
m += (lx - m) / float64(i+1)
}
return math.Exp(m)
}
// GeoMean returns the geometric mean of the Sample. All samples
// values must be positive.
func (s Sample) GeoMean() float64 {
if len(s.Xs) == 0 || s.Weights == nil {
return GeoMean(s.Xs)
}
m, wsum := 0.0, 0.0
for i, x := range s.Xs {
w := s.Weights[i]
wsum += w
lx := math.Log(x)
m += (lx - m) * w / wsum
}
return math.Exp(m)
}
// Variance returns the sample variance of xs.
func Variance(xs []float64) float64 {
if len(xs) == 0 {
return math.NaN()
} else if len(xs) <= 1 {
return 0
}
// Based on Wikipedia's presentation of Welford 1962
// (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm).
// This is more numerically stable than the standard two-pass
// formula and not prone to massive cancellation.
mean, M2 := 0.0, 0.0
for n, x := range xs {
delta := x - mean
mean += delta / float64(n+1)
M2 += delta * (x - mean)
}
return M2 / float64(len(xs)-1)
}
// Variance returns the sample variance of xs.
func (s Sample) Variance() float64 {
if len(s.Xs) == 0 || s.Weights == nil {
return Variance(s.Xs)
}
// TODO(austin)
panic("Weighted Variance not implemented")
}
// StdDev returns the sample standard deviation of xs.
func StdDev(xs []float64) float64 {
return math.Sqrt(Variance(xs))
}
// StdDev returns the sample standard deviation of the Sample.
func (s Sample) StdDev() float64 {
if len(s.Xs) == 0 || s.Weights == nil {
return StdDev(s.Xs)
}
// TODO(austin)
panic("Weighted StdDev not implemented")
}
// Percentile returns the pctileth value from the Sample. This uses
// interpolation method R8 from Hyndman and Fan (1996).
//
// pctile will be capped to the range [0, 1]. If len(xs) == 0 or all
// weights are 0, returns NaN.
//
// Percentile(0.5) is the median. Percentile(0.25) and
// Percentile(0.75) are the first and third quartiles, respectively.
//
// This is constant time if s.Sorted and s.Weights == nil.
func (s Sample) Percentile(pctile float64) float64 {
if len(s.Xs) == 0 {
return math.NaN()
} else if pctile <= 0 {
min, _ := s.Bounds()
return min
} else if pctile >= 1 {
_, max := s.Bounds()
return max
}
if !s.Sorted {
// TODO(austin) Use select algorithm instead
s = *s.Copy().Sort()
}
if s.Weights == nil {
N := float64(len(s.Xs))
//n := pctile * (N + 1) // R6
n := 1/3.0 + pctile*(N+1/3.0) // R8
kf, frac := math.Modf(n)
k := int(kf)
if k <= 0 {
return s.Xs[0]
} else if k >= len(s.Xs) {
return s.Xs[len(s.Xs)-1]
}
return s.Xs[k-1] + frac*(s.Xs[k]-s.Xs[k-1])
}
// TODO(austin): Implement interpolation
target := s.Weight() * pctile
// TODO(austin) If we had cumulative weights, we could
// do this in log time.
for i, weight := range s.Weights {
target -= weight
if target < 0 {
return s.Xs[i]
}
}
return s.Xs[len(s.Xs)-1]
}
// IQR returns the interquartile range of the Sample.
//
// This is constant time if s.Sorted and s.Weights == nil.
func (s Sample) IQR() float64 {
if !s.Sorted {
s = *s.Copy().Sort()
}
return s.Percentile(0.75) - s.Percentile(0.25)
}
type sampleSorter struct {
xs []float64
weights []float64
}
func (p *sampleSorter) Len() int {
return len(p.xs)
}
func (p *sampleSorter) Less(i, j int) bool {
return p.xs[i] < p.xs[j]
}
func (p *sampleSorter) Swap(i, j int) {
p.xs[i], p.xs[j] = p.xs[j], p.xs[i]
p.weights[i], p.weights[j] = p.weights[j], p.weights[i]
}
// Sort sorts the samples in place in s and returns s.
//
// A sorted sample improves the performance of some algorithms.
func (s *Sample) Sort() *Sample {
if s.Sorted || sort.Float64sAreSorted(s.Xs) {
// All set
} else if s.Weights == nil {
sort.Float64s(s.Xs)
} else {
sort.Sort(&sampleSorter{s.Xs, s.Weights})
}
s.Sorted = true
return s
}
// Copy returns a copy of the Sample.
//
// The returned Sample shares no data with the original, so they can
// be modified (for example, sorted) independently.
func (s Sample) Copy() *Sample {
xs := make([]float64, len(s.Xs))
copy(xs, s.Xs)
weights := []float64(nil)
if s.Weights != nil {
weights = make([]float64, len(s.Weights))
copy(weights, s.Weights)
}
return &Sample{xs, weights, s.Sorted}
}
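// A minimal usage sketch (hypothetical helper, assuming "fmt" is added to
// the imports): Percentile sorts a copy when needed and interpolates with
// R8, so the median of {1…5} is 3 and the IQR is about 2.67.
func exampleSample() {
	s := Sample{Xs: []float64{4, 1, 3, 2, 5}}
	fmt.Println(s.Percentile(0.5), s.IQR()) // 3 2.6666…
}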

pkg/obistats/scaler.go (new file, 119 lines)

@@ -0,0 +1,119 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import (
"fmt"
"strings"
)
// A Scaler is a function that scales and formats a measurement.
// All measurements within a given table row are formatted
// using the same scaler, so that the units are consistent
// across the row.
type Scaler func(float64) string
// NewScaler returns a Scaler appropriate for formatting
// the measurement val, which has the given unit.
func NewScaler(val float64, unit string) Scaler {
if hasBaseUnit(unit, "ns/op") || hasBaseUnit(unit, "ns/GC") {
return timeScaler(val)
}
var format string
var scale float64
var suffix string
prescale := 1.0
if hasBaseUnit(unit, "MB/s") {
prescale = 1e6
}
switch x := val * prescale; {
case x >= 99500000000000:
format, scale, suffix = "%.0f", 1e12, "T"
case x >= 9950000000000:
format, scale, suffix = "%.1f", 1e12, "T"
case x >= 995000000000:
format, scale, suffix = "%.2f", 1e12, "T"
case x >= 99500000000:
format, scale, suffix = "%.0f", 1e9, "G"
case x >= 9950000000:
format, scale, suffix = "%.1f", 1e9, "G"
case x >= 995000000:
format, scale, suffix = "%.2f", 1e9, "G"
case x >= 99500000:
format, scale, suffix = "%.0f", 1e6, "M"
case x >= 9950000:
format, scale, suffix = "%.1f", 1e6, "M"
case x >= 995000:
format, scale, suffix = "%.2f", 1e6, "M"
case x >= 99500:
format, scale, suffix = "%.0f", 1e3, "k"
case x >= 9950:
format, scale, suffix = "%.1f", 1e3, "k"
case x >= 995:
format, scale, suffix = "%.2f", 1e3, "k"
case x >= 99.5:
format, scale, suffix = "%.0f", 1, ""
case x >= 9.95:
format, scale, suffix = "%.1f", 1, ""
default:
format, scale, suffix = "%.2f", 1, ""
}
if hasBaseUnit(unit, "B/op") || hasBaseUnit(unit, "bytes/op") || hasBaseUnit(unit, "bytes") {
suffix += "B"
}
if hasBaseUnit(unit, "MB/s") {
suffix += "B/s"
}
scale /= prescale
return func(val float64) string {
return fmt.Sprintf(format+suffix, val/scale)
}
}
func timeScaler(ns float64) Scaler {
var format string
var scale float64
switch x := ns / 1e9; {
case x >= 99.5:
format, scale = "%.0fs", 1
case x >= 9.95:
format, scale = "%.1fs", 1
case x >= 0.995:
format, scale = "%.2fs", 1
case x >= 0.0995:
format, scale = "%.0fms", 1000
case x >= 0.00995:
format, scale = "%.1fms", 1000
case x >= 0.000995:
format, scale = "%.2fms", 1000
case x >= 0.0000995:
format, scale = "%.0fµs", 1000*1000
case x >= 0.00000995:
format, scale = "%.1fµs", 1000*1000
case x >= 0.000000995:
format, scale = "%.2fµs", 1000*1000
case x >= 0.0000000995:
format, scale = "%.0fns", 1000*1000*1000
case x >= 0.00000000995:
format, scale = "%.1fns", 1000*1000*1000
default:
format, scale = "%.2fns", 1000*1000*1000
}
return func(ns float64) string {
return fmt.Sprintf(format, ns/1e9*scale)
}
}
// hasBaseUnit reports whether s has unit unit.
// For now, it reports whether s == unit or s ends in -unit.
func hasBaseUnit(s, unit string) bool {
return s == unit || strings.HasSuffix(s, "-"+unit)
}
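// A minimal usage sketch (hypothetical helper): a value of 12345 with unit
// "ns/op" selects the time scaler in the microsecond range, so every value
// in a table row is formatted with the same unit prefix.
func exampleScaler() {
	sc := NewScaler(12345, "ns/op")
	fmt.Println(sc(12345)) // 12.3µs
}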

pkg/obistats/sort.go (new file, 37 lines)

@@ -0,0 +1,37 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import (
"math"
"sort"
)
// An Order defines a sort order for a table.
// It reports whether t.Rows[i] should appear before t.Rows[j].
type Order func(t *Table, i, j int) bool
// ByName sorts tables by the Benchmark name column
func ByName(t *Table, i, j int) bool {
return t.Rows[i].Benchmark < t.Rows[j].Benchmark
}
// ByDelta sorts tables by the Delta column,
// reversing the order when larger is better (for "speed" results).
func ByDelta(t *Table, i, j int) bool {
return math.Abs(t.Rows[i].PctDelta)*float64(t.Rows[i].Change) <
math.Abs(t.Rows[j].PctDelta)*float64(t.Rows[j].Change)
}
// Reverse returns the reverse of the given order.
func Reverse(order Order) Order {
return func(t *Table, i, j int) bool { return order(t, j, i) }
}
// Sort sorts a Table t (in place) by the given order.
func Sort(t *Table, order Order) {
sort.SliceStable(t.Rows, func(i, j int) bool { return order(t, i, j) })
}

pkg/obistats/table.go (new file, 213 lines)

@@ -0,0 +1,213 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import (
"fmt"
"strings"
)
// A Table is a table for display in the benchstat output.
type Table struct {
Metric string
OldNewDelta bool // is this an old-new-delta table?
Configs []string
Groups []string
Rows []*Row
}
// A Row is a table row for display in the benchstat output.
type Row struct {
Benchmark string // benchmark name
Group string // group name
Scaler Scaler // formatter for stats means
Metrics []*Metrics // columns of statistics
PctDelta float64 // unformatted percent change
Delta string // formatted percent change
Note string // additional information
Change int // +1 better, -1 worse, 0 unchanged
}
// Tables returns tables comparing the benchmarks in the collection.
func (c *Collection) Tables() []*Table {
deltaTest := c.DeltaTest
if deltaTest == nil {
deltaTest = UTest
}
alpha := c.Alpha
if alpha == 0 {
alpha = 0.05
}
// Update statistics.
for _, m := range c.Metrics {
m.computeStats()
}
var tables []*Table
key := Key{}
for _, key.Unit = range c.Units {
table := new(Table)
table.Configs = c.Configs
table.Groups = c.Groups
table.Metric = metricOf(key.Unit)
table.OldNewDelta = len(c.Configs) == 2
for _, key.Group = range c.Groups {
for _, key.Benchmark = range c.Benchmarks[key.Group] {
row := &Row{Benchmark: key.Benchmark}
if len(c.Groups) > 1 {
// Show group headers if there is more than one group.
row.Group = key.Group
}
for _, key.Config = range c.Configs {
m := c.Metrics[key]
if m == nil {
row.Metrics = append(row.Metrics, new(Metrics))
continue
}
row.Metrics = append(row.Metrics, m)
if row.Scaler == nil {
row.Scaler = NewScaler(m.Mean, m.Unit)
}
}
// If there are only two configs being compared, add stats.
if table.OldNewDelta {
k0 := key
k0.Config = c.Configs[0]
k1 := key
k1.Config = c.Configs[1]
old := c.Metrics[k0]
new := c.Metrics[k1]
// If one is missing, omit row entirely.
// TODO: Control this better.
if old == nil || new == nil {
continue
}
pval, testerr := deltaTest(old, new)
row.PctDelta = 0.00
row.Delta = "~"
if testerr == ErrZeroVariance {
row.Note = "(zero variance)"
} else if testerr == ErrSampleSize {
row.Note = "(too few samples)"
} else if testerr == ErrSamplesEqual {
row.Note = "(all equal)"
} else if testerr != nil {
row.Note = fmt.Sprintf("(%s)", testerr)
} else if pval < alpha {
if new.Mean == old.Mean {
row.Delta = "0.00%"
} else {
pct := ((new.Mean / old.Mean) - 1.0) * 100.0
row.PctDelta = pct
row.Delta = fmt.Sprintf("%+.2f%%", pct)
if pct < 0 == (table.Metric != "speed") { // smaller is better, except speeds
row.Change = +1
} else {
row.Change = -1
}
}
}
if row.Note == "" && pval != -1 {
row.Note = fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues))
}
}
table.Rows = append(table.Rows, row)
}
}
if len(table.Rows) > 0 {
if c.Order != nil {
Sort(table, c.Order)
}
if c.AddGeoMean {
addGeomean(c, table, key.Unit, table.OldNewDelta)
}
tables = append(tables, table)
}
}
return tables
}
var metricSuffix = map[string]string{
"ns/op": "time/op",
"ns/GC": "time/GC",
"B/op": "alloc/op",
"MB/s": "speed",
}
// metricOf returns the name of the metric with the given unit.
func metricOf(unit string) string {
if s := metricSuffix[unit]; s != "" {
return s
}
for s, suff := range metricSuffix {
if dashs := "-" + s; strings.HasSuffix(unit, dashs) {
prefix := strings.TrimSuffix(unit, dashs)
return prefix + "-" + suff
}
}
return unit
}
// addGeomean adds a "geomean" row to the table,
// showing the geometric mean of all the benchmarks.
func addGeomean(c *Collection, t *Table, unit string, delta bool) {
row := &Row{Benchmark: "[Geo mean]"}
key := Key{Unit: unit}
geomeans := []float64{}
maxCount := 0
for _, key.Config = range c.Configs {
var means []float64
for _, key.Group = range c.Groups {
for _, key.Benchmark = range c.Benchmarks[key.Group] {
m := c.Metrics[key]
// Omit 0 values from the geomean calculation,
// as these either make the geomean undefined
// or zero (depending on who you ask). This
// typically comes up with things like
// allocation counts, where it's fine to just
// ignore the benchmark.
if m != nil && m.Mean != 0 {
means = append(means, m.Mean)
}
}
}
if len(means) > maxCount {
maxCount = len(means)
}
if len(means) == 0 {
row.Metrics = append(row.Metrics, new(Metrics))
delta = false
} else {
geomean := GeoMean(means)
geomeans = append(geomeans, geomean)
if row.Scaler == nil {
row.Scaler = NewScaler(geomean, unit)
}
row.Metrics = append(row.Metrics, &Metrics{
Unit: unit,
Mean: geomean,
})
}
}
if maxCount <= 1 {
// Only one benchmark contributed to this geomean.
// Since the geomean is the same as the benchmark
// result, don't bother outputting it.
return
}
if delta {
pct := ((geomeans[1] / geomeans[0]) - 1.0) * 100.0
row.PctDelta = pct
row.Delta = fmt.Sprintf("%+.2f%%", pct)
}
t.Rows = append(t.Rows, row)
}

pkg/obistats/tdist.go (new file, 37 lines)

@@ -0,0 +1,37 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import "math"
// A TDist is a Student's t-distribution with V degrees of freedom.
type TDist struct {
V float64
}
// PDF is the probability distribution function
func (t TDist) PDF(x float64) float64 {
return math.Exp(lgamma((t.V+1)/2)-lgamma(t.V/2)) /
math.Sqrt(t.V*math.Pi) * math.Pow(1+(x*x)/t.V, -(t.V+1)/2)
}
// CDF is the cumulative distribution function
func (t TDist) CDF(x float64) float64 {
if x == 0 {
return 0.5
} else if x > 0 {
return 1 - 0.5*mathBetaInc(t.V/(t.V+x*x), t.V/2, 0.5)
} else if x < 0 {
return 1 - t.CDF(-x)
} else {
return math.NaN()
}
}
// Bounds ...
func (t TDist) Bounds() (float64, float64) {
return -4, 4
}
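// A minimal usage sketch (hypothetical helper, assuming "fmt" is added to
// the imports): the t-distribution is symmetric, so CDF(0) is exactly 0.5.
func exampleTDist() {
	fmt.Println(TDist{V: 4}.CDF(0)) // 0.5
}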

pkg/obistats/ttest.go (new file, 143 lines)

@@ -0,0 +1,143 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import (
"math"
)
// A TTestResult is the result of a t-test.
type TTestResult struct {
// N1 and N2 are the sizes of the input samples. For a
// one-sample t-test, N2 is 0.
N1, N2 int
// T is the value of the t-statistic for this t-test.
T float64
// DoF is the degrees of freedom for this t-test.
DoF float64
// AltHypothesis specifies the alternative hypothesis tested
// by this test against the null hypothesis that there is no
// difference in the means of the samples.
AltHypothesis LocationHypothesis
// P is p-value for this t-test for the given null hypothesis.
P float64
}
func newTTestResult(n1, n2 int, t, dof float64, alt LocationHypothesis) *TTestResult {
dist := TDist{dof}
var p float64
switch alt {
case LocationDiffers:
p = 2 * (1 - dist.CDF(math.Abs(t)))
case LocationLess:
p = dist.CDF(t)
case LocationGreater:
p = 1 - dist.CDF(t)
}
return &TTestResult{N1: n1, N2: n2, T: t, DoF: dof, AltHypothesis: alt, P: p}
}
// A TTestSample is a sample that can be used for a one or two sample
// t-test.
type TTestSample interface {
Weight() float64
Mean() float64
Variance() float64
}
// TwoSampleTTest performs a two-sample (unpaired) Student's t-test on
// samples x1 and x2. This is a test of the null hypothesis that x1
// and x2 are drawn from populations with equal means. It assumes x1
// and x2 are independent samples, that the distributions have equal
// variance, and that the populations are normally distributed.
func TwoSampleTTest(x1, x2 TTestSample, alt LocationHypothesis) (*TTestResult, error) {
n1, n2 := x1.Weight(), x2.Weight()
if n1 == 0 || n2 == 0 {
return nil, ErrSampleSize
}
v1, v2 := x1.Variance(), x2.Variance()
if v1 == 0 && v2 == 0 {
return nil, ErrZeroVariance
}
dof := n1 + n2 - 2
v12 := ((n1-1)*v1 + (n2-1)*v2) / dof
t := (x1.Mean() - x2.Mean()) / math.Sqrt(v12*(1/n1+1/n2))
return newTTestResult(int(n1), int(n2), t, dof, alt), nil
}
// TwoSampleWelchTTest performs a two-sample (unpaired) Welch's t-test
// on samples x1 and x2. This is like TwoSampleTTest, but does not
// assume the distributions have equal variance.
func TwoSampleWelchTTest(x1, x2 TTestSample, alt LocationHypothesis) (*TTestResult, error) {
n1, n2 := x1.Weight(), x2.Weight()
if n1 <= 1 || n2 <= 1 {
// TODO: Can we still do this with n == 1?
return nil, ErrSampleSize
}
v1, v2 := x1.Variance(), x2.Variance()
if v1 == 0 && v2 == 0 {
return nil, ErrZeroVariance
}
dof := math.Pow(v1/n1+v2/n2, 2) /
(math.Pow(v1/n1, 2)/(n1-1) + math.Pow(v2/n2, 2)/(n2-1))
s := math.Sqrt(v1/n1 + v2/n2)
t := (x1.Mean() - x2.Mean()) / s
return newTTestResult(int(n1), int(n2), t, dof, alt), nil
}
// PairedTTest performs a two-sample paired t-test on samples x1 and
// x2. If μ0 is non-zero, this tests if the average of the difference
// is significantly different from μ0. If x1 and x2 are identical,
// this returns nil.
func PairedTTest(x1, x2 []float64, μ0 float64, alt LocationHypothesis) (*TTestResult, error) {
if len(x1) != len(x2) {
return nil, ErrMismatchedSamples
}
if len(x1) <= 1 {
// TODO: Can we still do this with n == 1?
return nil, ErrSampleSize
}
dof := float64(len(x1) - 1)
diff := make([]float64, len(x1))
for i := range x1 {
diff[i] = x1[i] - x2[i]
}
sd := StdDev(diff)
if sd == 0 {
// TODO: Can we still do the test?
return nil, ErrZeroVariance
}
t := (Mean(diff) - μ0) * math.Sqrt(float64(len(x1))) / sd
return newTTestResult(len(x1), len(x2), t, dof, alt), nil
}
// OneSampleTTest performs a one-sample t-test on sample x. This tests
// the null hypothesis that the population mean is equal to μ0. This
// assumes the distribution of the population of sample means is
// normal.
func OneSampleTTest(x TTestSample, μ0 float64, alt LocationHypothesis) (*TTestResult, error) {
n, v := x.Weight(), x.Variance()
if n == 0 {
return nil, ErrSampleSize
}
if v == 0 {
// TODO: Can we still do the test?
return nil, ErrZeroVariance
}
dof := n - 1
t := (x.Mean() - μ0) * math.Sqrt(n) / math.Sqrt(v)
return newTTestResult(int(n), 0, t, dof, alt), nil
}
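// A minimal usage sketch (hypothetical helper, assuming "fmt" is added to
// the imports): Sample implements TTestSample, so two sets of measurements
// can be compared directly with Welch's t-test.
func exampleWelch() {
	x1 := Sample{Xs: []float64{105, 98, 102, 101}}
	x2 := Sample{Xs: []float64{90, 91, 89, 92}}
	if r, err := TwoSampleWelchTTest(x1, x2, LocationDiffers); err == nil {
		fmt.Println(r.T, r.DoF, r.P) // t ≈ 6.96, dof ≈ 4.2, p < 0.01
	}
}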

pkg/obistats/udist.go (new file, 386 lines)

@@ -0,0 +1,386 @@
package obistats
//
// Duplicated code from the internal module available at:
// https://github.com/golang-design/bench.git
//
import "math"
// A UDist is the discrete probability distribution of the
// Mann-Whitney U statistic for a pair of samples of sizes N1 and N2.
//
// The details of computing this distribution with no ties can be
// found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of
// Whether one of Two Random Variables is Stochastically Larger than
// the Other". Annals of Mathematical Statistics 18 (1): 50-60.
// Computing this distribution in the presence of ties is described in
// Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer".
// Journal of the American Statistical Association 61 (315): 772-787
// and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
// 805-813 (the former paper contains details that are glossed over in
// the latter paper but has mathematical typesetting issues, so it's
// easiest to get the context from the former paper and the details
// from the latter).
type UDist struct {
N1, N2 int
// T is the count of the number of ties at each rank in the
// input distributions. T may be nil, in which case it is
// assumed there are no ties (which is equivalent to an M+N
// slice of 1s). It must be the case that Sum(T) == M+N.
T []int
}
// hasTies returns true if d has any tied samples.
func (d UDist) hasTies() bool {
for _, t := range d.T {
if t > 1 {
return true
}
}
return false
}
// p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947
// for values of U from 0 up to and including the U argument.
//
// This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite
// fast for small values of N1 and N2. However, it does not handle ties.
func (d UDist) p(U int) []float64 {
// This is a dynamic programming implementation of the
// recursive recurrence definition given by Mann and Whitney:
//
// p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m)
// p_{n,m}(U) = 0 if U < 0
// p_{0,m}(U) = p{n,0}(U) = 1 / nCr(m+n, n) if U = 0
// = 0 if U > 0
//
// (Note that there is a typo in the original paper. The first
// recursive application of p should be for U-m, not U-M.)
//
// Since p{n,m} only depends on p{n-1,m} and p{n,m-1}, we only
// need to store one "plane" of the three dimensional space at
// a time.
//
// Furthermore, p_{n,m} = p_{m,n}, so we only construct values
// for n <= m and obtain the rest through symmetry.
//
// We organize the computed values of p as follows:
//
// n → N
// m *
// ↓ * *
// * * *
// * * * *
// * * * *
// M * * * *
//
// where each * is a slice indexed by U. The code below
// computes these left-to-right, top-to-bottom, so it only
// stores one row of this matrix at a time. Furthermore,
// computing an element in a given U slice only depends on the
// same and smaller values of U, so we can overwrite the U
// slice we're computing in place as long as we start with the
// largest value of U. Finally, even though the recurrence
// depends on (n,m) above the diagonal and we use symmetry to
// mirror those across the diagonal to (m,n), the mirrored
// indexes are always available in the current row, so this
// mirroring does not interfere with our ability to recycle
// state.
N, M := d.N1, d.N2
if N > M {
N, M = M, N
}
memo := make([][]float64, N+1)
for n := range memo {
memo[n] = make([]float64, U+1)
}
for m := 0; m <= M; m++ {
// Compute p_{0,m}. This is zero except for U=0.
memo[0][0] = 1
// Compute the remainder of this row.
nlim := N
if m < nlim {
nlim = m
}
for n := 1; n <= nlim; n++ {
lp := memo[n-1] // p_{n-1,m}
var rp []float64
if n <= m-1 {
rp = memo[n] // p_{n,m-1}
} else {
rp = memo[m-1] // p{m-1,n} and m==n
}
// For a given n,m, U is at most n*m.
//
// TODO: Actually, it's at most ⌈n*m/2⌉, but
// then we need to use more complex symmetries
// in the inner loop below.
ulim := n * m
if U < ulim {
ulim = U
}
out := memo[n] // p_{n,m}
nplusm := float64(n + m)
for U1 := ulim; U1 >= 0; U1-- {
l := 0.0
if U1-m >= 0 {
l = float64(n) * lp[U1-m]
}
r := float64(m) * rp[U1]
out[U1] = (l + r) / nplusm
}
}
}
return memo[N]
}
type ukey struct {
n1 int // size of first sample
twoU int // 2*U statistic for this permutation
}
// This computes the cumulative counts of the Mann-Whitney U
// distribution in the presence of ties using the computation from
// Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney
// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7:
// 805-813, with much guidance from appendix L of Klotz, A
// Computational Approach to Statistics.
//
// makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the
// number of ranks (up to len(t)), n1 is the size of the first sample
// (up to the n1 argument), and U is the U statistic (up to the
// argument twoU/2). The value of an entry in the memo table is the
// number of permutations of a sample of size n1 in a ranking with tie
// vector t[:K] having a U statistic <= U.
func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 {
// Another candidate for a fast implementation is van de Wiel,
// "The split-up algorithm: a fast symbolic method for
// computing p-values of distribution-free statistics". This
// is what's used by R's coin package. It's a comparatively
// recent publication, so it's presumably faster (or perhaps
// just more general) than previous techniques, but I can't
// get my hands on the paper.
//
// TODO: ~40% of this function's time is spent in mapassign on
// the assignment lines in the two loops and another ~20% in
// map access and iteration. Improving map behavior or
// replacing the maps altogether with some other constant-time
// structure could double performance.
//
// TODO: The worst case for this function is when there are
// few ties. Yet the best case overall is when there are *no*
// ties. Can we get the best of both worlds? Use the fast
// algorithm for the most part when there are few ties and mix
// in the general algorithm just where we need it? That's
// certainly possible for sub-problems where t[:k] has no
// ties, but that doesn't help if t[0] has a tie but nothing
// else does. Is it possible to rearrange the ranks without
// messing up our computation of the U statistic for
// sub-problems?
K := len(t)
// Compute a coefficients. The a slice is indexed by k (a[0]
// is unused).
a := make([]int, K+1)
a[1] = t[0]
for k := 2; k <= K; k++ {
a[k] = a[k-1] + t[k-2] + t[k-1]
}
// Create the memo table for the counts function, A. The A
// slice is indexed by k (A[0] is unused).
//
// In "The Mann Whitney Distribution Using Linked Lists", they
// use linked lists (*gasp*) for this, but within each K it's
// really just a memoization table, so it's faster to use a
// map. The outer structure is a slice indexed by k because we
// need to find all memo entries with certain values of k.
//
// TODO: The n1 and twoU values in the ukeys follow strict
// patterns. For each K value, the n1 values are every integer
// between two bounds. For each (K, n1) value, the twoU values
// are every integer multiple of a certain base between two
// bounds. It might be worth turning these into directly
// indexable slices.
A := make([]map[ukey]float64, K+1)
A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0}
// Compute memo table (k, n1, twoU) triples from high K values
// to low K values. This drives the recurrence relation
// downward to figure out all of the needed argument triples.
//
// TODO: Is it possible to generate this table bottom-up? If
// so, this could be a pure dynamic programming algorithm and
// we could discard the K dimension. We could at least store
// the inputs in a more compact representation that replaces
// the twoU dimension with an interval and a step size (as
// suggested by Cheung, Klotz, not that they make it at all
// clear *why* they're suggesting this).
tsum := sumint(t) // always ∑ t[0:k]
for k := K - 1; k >= 2; k-- {
tsum -= t[k]
A[k] = make(map[ukey]float64)
// Construct A[k] from A[k+1].
for A_kplus1 := range A[k+1] {
rkLow := maxint(0, A_kplus1.n1-tsum)
rkHigh := minint(A_kplus1.n1, t[k])
for rk := rkLow; rk <= rkHigh; rk++ {
twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk)
n1_k := A_kplus1.n1 - rk
if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) {
key := ukey{n1: n1_k, twoU: twoU_k}
A[k][key] = 0
}
}
}
}
// Fill counts in memo table from low K values to high K
// values. This unwinds the recurrence relation.
// Start with K==2 base case.
//
// TODO: Later computations depend on these, but these don't
// depend on anything (including each other), so if K==2, we
// can skip the memo table altogether.
if K < 2 {
panic("K < 2")
}
N_2 := t[0] + t[1]
for A_2i := range A[2] {
Asum := 0.0
r2Low := maxint(0, A_2i.n1-t[0])
r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2
for r2 := r2Low; r2 <= r2High; r2++ {
Asum += mathChoose(t[0], A_2i.n1-r2) *
mathChoose(t[1], r2)
}
A[2][A_2i] = Asum
}
// Derive counts for the rest of the memo table.
tsum = t[0] // always ∑ t[0:k-1]
for k := 3; k <= K; k++ {
tsum += t[k-2]
// Compute A[k] counts from A[k-1] counts.
for A_ki := range A[k] {
Asum := 0.0
rkLow := maxint(0, A_ki.n1-tsum)
rkHigh := minint(A_ki.n1, t[k-1])
for rk := rkLow; rk <= rkHigh; rk++ {
twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk)
n1_kminus1 := A_ki.n1 - rk
x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}]
if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 {
x = mathChoose(tsum, n1_kminus1)
}
Asum += x * mathChoose(t[k-1], rk)
}
A[k][A_ki] = Asum
}
}
return A
}
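// A sketch of how the table documented above is queried (hypothetical helper,
// placed in the same package since makeUmemo and ukey are unexported). For a
// tie vector such as t = []int{1, 2, 1} — four observations with the middle
// two tied — it returns the number of permutations of a first sample of size
// n1 whose statistic satisfies 2*U <= twoU, exactly as PMF and CDF read it
// back below.
func countTwoULeq(twoU, n1 int, t []int) float64 {
	A := makeUmemo(twoU, n1, t)
	count, ok := A[len(t)][ukey{n1: n1, twoU: twoU}]
	if !ok {
		panic("makeUmemo did not return expected memoization table")
	}
	return count
}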
func twoUmin(n1 int, t, a []int) int {
K := len(t)
twoU := -n1 * n1
n1_k := n1
for k := 1; k <= K; k++ {
twoU_k := minint(n1_k, t[k-1])
twoU += twoU_k * a[k]
n1_k -= twoU_k
}
return twoU
}
func twoUmax(n1 int, t, a []int) int {
K := len(t)
twoU := -n1 * n1
n1_k := n1
for k := K; k > 0; k-- {
twoU_k := minint(n1_k, t[k-1])
twoU += twoU_k * a[k]
n1_k -= twoU_k
}
return twoU
}
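// Plausibility check for the two bounds above (a sketch, same package): with
// no ties the tie vector is all ones, a[k] = 2k-1, and the bounds collapse to
// the classical range 0 <= 2U <= 2*n1*n2.
func checkTwoUBoundsNoTies(n1, n2 int) bool {
	t := make([]int, n1+n2)
	for i := range t {
		t[i] = 1
	}
	a := make([]int, len(t)+1)
	a[1] = t[0]
	for k := 2; k <= len(t); k++ {
		a[k] = a[k-1] + t[k-2] + t[k-1]
	}
	return twoUmin(n1, t, a) == 0 && twoUmax(n1, t, a) == 2*n1*n2
}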
func (d UDist) PMF(U float64) float64 {
if U < 0 || U >= 0.5+float64(d.N1*d.N2) {
return 0
}
if d.hasTies() {
// makeUmemo computes the CDF directly. Take its
// difference to get the PMF.
p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}]
p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}]
if !ok1 || !ok2 {
panic("makeUmemo did not return expected memoization table")
}
return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1)
}
// There are no ties. Use the fast algorithm. U must be integral.
Ui := int(math.Floor(U))
// TODO: Use symmetry to minimize U
return d.p(Ui)[Ui]
}
func (d UDist) CDF(U float64) float64 {
if U < 0 {
return 0
} else if U >= float64(d.N1*d.N2) {
return 1
}
if d.hasTies() {
// TODO: Minimize U?
p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}]
if !ok {
panic("makeUmemo did not return expected memoization table")
}
return p / mathChoose(d.N1+d.N2, d.N1)
}
// There are no ties. Use the fast algorithm. U must be integral.
Ui := int(math.Floor(U))
// The distribution is symmetric around U = m * n / 2. Sum up
// whichever tail is smaller.
flip := Ui >= (d.N1*d.N2+1)/2
if flip {
Ui = d.N1*d.N2 - Ui - 1
}
pdfs := d.p(Ui)
p := 0.0
for _, pdf := range pdfs[:Ui+1] {
p += pdf
}
if flip {
p = 1 - p
}
return p
}
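// Sketch of how a caller can turn this CDF into an exact upper-tail p-value
// (hypothetical helper; with ties U moves on a half-integer lattice, so the
// previous lattice point is U - Step()):
//
//	P(U >= u) = 1 - P(U <= u - Step())
func upperTailP(d UDist, u float64) float64 {
	return 1 - d.CDF(u-d.Step())
}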
func (d UDist) Step() float64 {
return 0.5
}
func (d UDist) Bounds() (float64, float64) {
// TODO: More precise bounds when there are ties.
return 0, float64(d.N1 * d.N2)
}

View File

@ -1,6 +1,8 @@
package obicleandb
import (
"math/rand"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
@ -18,6 +20,114 @@ func SequenceTrust(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error
return obiseq.BioSequenceSlice{sequence}, nil
}
func MakeSequenceFamilyGenusWorker(references obiseq.BioSequenceSlice) obiseq.SeqWorker {
genus := make(map[int]*obiseq.BioSequenceSlice)
family := make(map[int]*obiseq.BioSequenceSlice)
for _, ref := range references {
		g, _ := ref.GetIntAttribute("genus_taxid")
		f, _ := ref.GetIntAttribute("family_taxid")
		gs, ok := genus[g]
if !ok {
gs = obiseq.NewBioSequenceSlice(0)
genus[g] = gs
}
*gs = append(*gs, ref)
fs, ok := family[f]
if !ok {
fs = obiseq.NewBioSequenceSlice(0)
family[f] = fs
}
*fs = append(*fs, ref)
}
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
g, _ := sequence.GetIntAttribute("genus_taxid")
sequence.SetAttribute("obicleandb_level", "genus")
gs := genus[g]
indist := make([]float64, 0, gs.Len())
for _, s := range *gs {
if s != sequence {
lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
indist = append(indist, float64(lali-lca))
}
}
nindist := len(indist)
pval := 0.0
f, _ := sequence.GetIntAttribute("family_taxid")
fs := family[f]
if nindist < 5 {
sequence.SetAttribute("obicleandb_level", "family")
for _, s := range *fs {
gf, _ := s.GetIntAttribute("genus_taxid")
if g != gf {
lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
indist = append(indist, float64(lali-lca))
}
}
nindist = len(indist)
}
if nindist > 0 {
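			// Aim for as many out-of-family distances as there are
			// in-family ones, with a floor of 20.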
next := nindist
if next <= 20 {
next = 20
}
outdist := make([]float64, 0, nindist)
p := rand.Perm(references.Len())
i := 0
for _, ir := range p {
s := references[ir]
ff, _ := s.GetIntAttribute("family_taxid")
if ff != f {
lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
outdist = append(outdist, float64(lali-lca))
i += 1
if i >= next {
break
}
}
}
res, err := obistats.MannWhitneyUTest(outdist, indist, obistats.LocationGreater)
if err == nil {
pval = res.P
}
level, _ := sequence.GetAttribute("obicleandb_level")
log.Warnf("%s - level: %v", sequence.Id(), level)
log.Warnf("%s - gdist: %v", sequence.Id(), indist)
log.Warnf("%s - fdist: %v", sequence.Id(), outdist)
log.Warnf("%s - pval: %f", sequence.Id(), pval)
} else {
sequence.SetAttribute("obicleandb_level", "none")
}
sequence.SetAttribute("obicleandb_trusted", pval)
return obiseq.BioSequenceSlice{sequence}, nil
}
return f
}
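// The distance fed to the Mann-Whitney test above is the number of
// differences implied by the longest common subsequence: alignment length
// minus LCS length. A minimal sketch of that computation in isolation
// (hypothetical helper; same obialign call as in the worker):
func lcsDistance(a, b *obiseq.BioSequence) float64 {
	lcs, lali := obialign.FastLCSScore(a, b, -1, nil)
	return float64(lali - lcs)
}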
func diagCoord(x, y, n int) int {
if x > y {
x, y = y, x
@ -160,19 +270,26 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
obioptions.CLIParallelWorkers(),
)
	references := annotated.Load()

	mannwhitney := MakeSequenceFamilyGenusWorker(references)

	partof := obiiter.IBatchOver(references,
		obioptions.CLIBatchSize()).Speed("Testing belonging to genus")
// genera_iterator, err := obichunk.ISequenceChunk(
// annotated,
// obiseq.AnnotationClassifier("genus_taxid", "NA"),
// )
// if err != nil {
// log.Fatal(err)
// }
// trusted := genera_iterator.MakeISliceWorker(
// SequenceTrustSlice,
// false,
// )
	return partof.MakeIWorker(mannwhitney, true)
}

View File

@ -0,0 +1,44 @@
package obidemerge
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
func MakeDemergeWorker(key string) obiseq.SeqWorker {
desc := obiseq.MakeStatsOnDescription(key)
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
if sequence.HasStatsOn(key) {
stats := sequence.StatsOn(desc, "NA")
sequence.DeleteAttribute(obiseq.StatsOnSlotName(key))
slice := obiseq.NewBioSequenceSlice(len(stats))
i := 0
for k, v := range stats {
(*slice)[i] = sequence.Copy()
(*slice)[i].SetAttribute(key, k)
(*slice)[i].SetCount(v)
i++
}
return *slice, nil
}
return obiseq.BioSequenceSlice{sequence}, nil
}
return obiseq.SeqWorker(f)
}
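// Illustration of the demerge semantics (hypothetical values): a sequence
// whose merged_sample slot is {"A": 3, "B": 1} is expanded by the worker
// above into two copies, one with sample = "A" and count = 3, the other with
// sample = "B" and count = 1; the merged_sample slot itself is deleted.
func demergeBySample(seq *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
	worker := MakeDemergeWorker("sample")
	return worker(seq)
}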
func CLIDemergeSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
if CLIHasSlotToDemerge() {
worker := MakeDemergeWorker(CLIDemergeSlot())
return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0)
}
return iterator
}

View File

@ -0,0 +1,30 @@
package obidemerge
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
var _Demerge = ""
func DemergeOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_Demerge, "demerge", _Demerge,
options.Alias("d"),
options.Description("Indicates which slot has to be demerged."))
}
// OptionSet adds to the basic option set every option declared for
// the obidemerge command
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
DemergeOptionSet(options)
}
func CLIDemergeSlot() string {
return _Demerge
}
func CLIHasSlotToDemerge() bool {
return _Demerge != ""
}
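// Hypothetical command line built from the options declared above:
//
//	obidemerge --demerge sample merged.fasta > demerged.fasta
//
// or, using the short alias:
//
//	obidemerge -d sample merged.fasta > demerged.fasta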

View File

@ -0,0 +1,132 @@
package obijoin
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
type IndexedSequenceSlice struct {
Sequences obiseq.BioSequenceSlice
Indices []map[interface{}]*obiutils.Set[int]
}
func (s IndexedSequenceSlice) Len() int {
return len(s.Sequences)
}
func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
	var keeps obiutils.Set[int]

	for i, v := range s.Indices {
		ids, ok := v[keys[i]]
		if !ok {
			// The key value does not occur in the index: no possible match.
			empty := obiseq.MakeBioSequenceSlice(0)
			return &empty
		}
		if i == 0 {
			keeps = *ids
		} else {
			keeps = keeps.Intersection(*ids)
		}
	}
rep := obiseq.MakeBioSequenceSlice(len(keeps))
for i, v := range keeps.Members() {
rep[i] = s.Sequences[v]
}
return &rep
}
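// Lookup semantics of Get (hypothetical values): with an index built on two
// keys, the per-key hit sets are intersected, so
//
//	idx.Get("sample_A", 42)
//
// returns only the sequences whose first key equals "sample_A" and whose
// second key equals 42, and an empty slice as soon as one key value is
// absent from the index.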
func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
indices := make([]map[interface{}]*obiutils.Set[int], len(keys))
for i, k := range keys {
idx := make(map[interface{}]*obiutils.Set[int])
for j, seq := range seqs {
if value, ok := seq.GetAttribute(k); ok {
goods, ok := idx[value]
if !ok {
goods = obiutils.NewSet[int]()
idx[value] = goods
}
goods.Add(j)
}
}
indices[i] = idx
}
return IndexedSequenceSlice{seqs, indices}
}
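// Minimal usage sketch (hypothetical attribute name): index a reference
// slice on its "taxid" annotation, then fetch every sequence carrying a
// given value. Note that the queried key must compare equal to the stored
// annotation value, including its type.
func lookupByTaxid(refs obiseq.BioSequenceSlice, taxid int) *obiseq.BioSequenceSlice {
	idx := BuildIndexedSequenceSlice(refs, []string{"taxid"})
	return idx.Get(taxid)
}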
func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSequence, updateQuality bool) obiseq.SeqWorker {
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
var ok bool
keys := make([]interface{}, len(by))
for i, v := range by {
keys[i], ok = sequence.GetAttribute(v)
if !ok {
return obiseq.BioSequenceSlice{sequence}, nil
}
}
		join_with := index.Get(keys...)

		if join_with.Len() == 0 {
			return obiseq.BioSequenceSlice{sequence}, nil
		}

		rep := obiseq.MakeBioSequenceSlice(join_with.Len())
for i, v := range *join_with {
rep[i] = sequence.Copy()
annot := rep[i].Annotations()
new_annot := v.Annotations()
			for ak, av := range new_annot {
				annot[ak] = av
			}
if updateId {
rep[i].SetId(v.Id())
}
if updateSequence && len(v.Sequence()) > 0 {
rep[i].SetSequence(v.Sequence())
}
if updateQuality && len(v.Qualities()) > 0 {
rep[i].SetQualities(v.Qualities())
}
}
return rep, nil
}
return obiseq.SeqWorker(f)
}
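// Behaviour of the worker above, in short: a sequence whose join keys are
// absent, or match nothing in the index, passes through unchanged; a
// sequence with k matches is duplicated k times, each copy receiving the
// annotations (and optionally the id, sequence and qualities) of one match.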
func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
data_iter, err := obiformats.ReadSequencesFromFile(CLIJoinWith())
if err != nil {
log.Fatalf("Cannot read the data file to merge with: %s %v", CLIJoinWith(), err)
}
data := data_iter.Load()
keys := CLIBy()
index := BuildIndexedSequenceSlice(data, keys.Right)
worker := MakeJoinWorker(keys.Left, index, CLIUpdateId(), CLIUpdateSequence(), CLIUpdateQuality())
iterator = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
return iterator
}

View File

@ -0,0 +1,90 @@
package obijoin
import (
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
var _by = []string{}
var _join = ""
var _UpdateID = false
var _UpdateSequence = false
var _UpdateQuality = false
type By struct {
Left []string
Right []string
}
func JoinOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&_by, "by", 1, 1,
options.Alias("b"),
options.Description("to declare join keys."))
options.StringVar(&_join, "join-with", _join,
options.Alias("j"),
options.Description("file name of the file to join with."),
options.Required("You must provide a file name to join with."))
options.BoolVar(&_UpdateID, "update-id", _UpdateID,
options.Alias("i"),
options.Description("Update the sequence IDs in the joined file."))
options.BoolVar(&_UpdateSequence, "update-sequence", _UpdateSequence,
options.Alias("s"),
options.Description("Update the sequence in the joined file."))
options.BoolVar(&_UpdateQuality, "update-quality", _UpdateQuality,
options.Alias("q"),
options.Description("Update the quality in the joined file."))
}
// OptionSet adds to the basic option set every option declared for
// the obijoin command
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
JoinOptionSet(options)
}
func CLIBy() By {
if len(_by) == 0 {
return By{
Left: []string{"id"},
Right: []string{"id"},
}
}
left := make([]string, len(_by))
right := make([]string, len(_by))
for i, v := range _by {
vals := strings.Split(v, "=")
left[i] = vals[0]
right[i] = vals[0]
if len(vals) > 1 {
right[i] = vals[1]
}
}
return By{Left: left, Right: right}
}
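// Example of the parsing above (hypothetical key names): "--by sample=sample_id"
// yields By{Left: []string{"sample"}, Right: []string{"sample_id"}}, while a
// bare "--by taxid" uses "taxid" on both sides; with no --by at all the join
// defaults to the sequence ids.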
func CLIJoinWith() string {
return _join
}
func CLIUpdateId() bool {
return _UpdateID
}
func CLIUpdateSequence() bool {
return _UpdateSequence
}
func CLIUpdateQuality() bool {
return _UpdateQuality
}
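// Hypothetical command line built from the options declared above:
//
//	obijoin --join-with taxonomy.csv --by taxid=taxon_id reads.fasta
//
// joins each read on its taxid annotation against the taxon_id column of
// taxonomy.csv (input and output files being handled by the shared
// obiconvert options).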