New command: obi stats

This commit is contained in:
Celine Mercier
2018-08-13 15:08:10 +02:00
parent 2dc7fcceac
commit b57e938cc4

View File

@ -0,0 +1,262 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.uri.decode import open_uri
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyInputOption
from obitools3.dms.view import RollbackException
from obitools3.apps.config import logger
from obitools3.dms.capi.obiview cimport COUNT_COLUMN
from functools import reduce
import math
import time
__title__="Compute basic statistics for attribute values."
'''
`obi stats` computes basic statistics for attribute values of sequence records.
The sequence records can be categorized or not using one or several ``-c`` options.
By default, only the number of sequence records and the total count are computed for each category.
Additional statistics can be computed for attribute values in each category, such as:
- minimum value (``-m`` option)
- maximum value (``-M`` option)
- mean value (``-a`` option)
- variance (``-v`` option)
- standard deviation (``-s`` option)
The result is a contingency table with the different categories in rows, and the
computed statistics in columns.
'''
# TODO: when is the taxonomy possibly used?
def addOptions(parser):
addMinimalInputOption(parser)
addTaxonomyInputOption(parser)
group=parser.add_argument_group('obi stats specific options')
group.add_argument('-c','--category-attribute',
action="append", dest="stats:categories",
metavar="<Attribute Name>",
default=[],
help="Attribute used to categorize the records.")
group.add_argument('-m','--min',
action="append", dest="stats:minimum",
metavar="<Attribute Name>",
default=[],
help="Compute the minimum value of attribute for each category.")
group.add_argument('-M','--max',
action="append", dest="stats:maximum",
metavar="<Attribute Name>",
default=[],
help="Compute the maximum value of attribute for each category.")
group.add_argument('-a','--mean',
action="append", dest="stats:mean",
metavar="<Attribute Name>",
default=[],
help="Compute the mean value of attribute for each category.")
group.add_argument('-v','--variance',
action="append", dest="stats:var",
metavar="<Attribute Name>",
default=[],
help="Compute the variance of attribute for each category.")
group.add_argument('-s','--std-dev',
action="append", dest="stats:sd",
metavar="<Attribute Name>",
default=[],
help="Compute the standard deviation of attribute for each category.")
def statistics(values, attributes, func):
stat={}
lstat={}
for var in attributes:
if var in values:
stat[var]={}
lstat[var]=0
for c in values[var]:
v = values[var][c]
m = func(v)
stat[var][c]=m
lm=len(str(m))
if lm > lstat[var]:
lstat[var]=lm
return stat, lstat
def minimum(values, options):
return statistics(values, options['minimum'], min)
def maximum(values, options):
return statistics(values, options['maximum'], max)
def mean(values, options):
def average(v):
s = reduce(lambda x,y:x+y,v,0)
return float(s)/len(v)
return statistics(values, options['mean'], average)
def variance(v):
if len(v)==1:
return 0
s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.))
return s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1)
def varpop(values, options):
return statistics(values, options['var'], variance)
def sd(values, options):
def stddev(v):
return math.sqrt(variance(v))
return statistics(values, options['sd'], stddev)
def run(config):
DMS.obi_atexit()
logger("info", "obi stats")
# Open the input
input = open_uri(config['obi']['inputURI'])
if input is None:
raise Exception("Could not read input view")
i_view = input[1]
if 'taxoURI' in config['obi'] : # TODO default None problem
taxo_uri = open_uri(config['obi']['taxoURI'])
if taxo_uri is None:
raise Exception("Couldn't open taxonomy")
taxo = taxo_uri[1]
else :
taxo = None
statistics = set(config['stats']['minimum']) | set(config['stats']['maximum']) | set(config['stats']['mean'])
total = 0
catcount={}
totcount={}
values={}
lcat=0
# Initialize the progress bar
pb = ProgressBar(len(i_view), config, seconde=5)
for i in range(len(i_view)):
pb(i)
line = i_view[i]
category = []
for c in config['stats']['categories']:
try:
if taxo is not None:
loc_env = {'sequence': line, 'line': line, 'taxonomy': taxo}
else:
loc_env = {'sequence': line, 'line': line}
v = eval(c, loc_env, line)
lv=len(str(v))
if lv > lcat:
lcat=lv
category.append(v)
except:
category.append(None)
if 4 > lcat:
lcat=4
category=tuple(category)
catcount[category]=catcount.get(category,0)+1
try:
totcount[category]=totcount.get(category,0)+line[COUNT_COLUMN]
except KeyError:
totcount[category]=totcount.get(category,0)+1
for var in statistics:
if var in line:
v = line[var]
if var not in values:
values[var]={}
if category not in values[var]:
values[var][category]=[]
values[var][category].append(v)
mini, lmini = minimum(values, config['stats'])
maxi, lmaxi = maximum(values, config['stats'])
avg, lavg = mean(values, config['stats'])
varp, lvarp = varpop(values, config['stats'])
sigma, lsigma = sd(values, config['stats'])
pcat = "%%-%ds" % lcat
if config['stats']['minimum']:
minvar= "min_%%-%ds" % max(len(x) for x in config['stats']['minimum'])
else:
minvar= "%s"
if config['stats']['maximum']:
maxvar= "max_%%-%ds" % max(len(x) for x in config['stats']['maximum'])
else:
maxvar= "%s"
if config['stats']['mean']:
meanvar= "mean_%%-%ds" % max(len(x) for x in config['stats']['mean'])
else:
meanvar= "%s"
if config['stats']['var']:
varvar= "var_%%-%ds" % max(len(x) for x in config['stats']['var'])
else:
varvar= "%s"
if config['stats']['sd']:
sdvar= "sd_%%-%ds" % max(len(x) for x in config['stats']['sd'])
else:
sdvar= "%s"
hcat = "\t".join([pcat % x for x in config['stats']['categories']]) + "\t" +\
"\t".join([minvar % x for x in config['stats']['minimum']]) + "\t" +\
"\t".join([maxvar % x for x in config['stats']['maximum']]) + "\t" +\
"\t".join([meanvar % x for x in config['stats']['mean']]) + "\t" +\
"\t".join([varvar % x for x in config['stats']['var']]) + "\t" +\
"\t".join([sdvar % x for x in config['stats']['sd']]) + \
"\t count" + \
"\t total"
print(hcat)
for c in catcount:
for v in c:
print(pcat % str(v)+"\t", end="")
for m in config['stats']['minimum']:
print((("%%%dd" % lmini[m]) % mini[m][c])+"\t", end="")
for m in config['stats']['maximum']:
print((("%%%dd" % lmaxi[m]) % maxi[m][c])+"\t", end="")
for m in config['stats']['mean']:
print((("%%%df" % lavg[m]) % avg[m][c])+"\t", end="")
for m in config['stats']['var']:
print((("%%%df" % lvarp[m]) % varp[m][c])+"\t", end="")
for m in config['stats']['sd']:
print((("%%%df" % lsigma[m]) % sigma[m][c])+"\t", end="")
print("%7d" %catcount[c], end="")
print("%9d" %totcount[c])
input[0].close()