274 lines
8.9 KiB
Cython
Executable File
274 lines
8.9 KiB
Cython
Executable File
#cython: language_level=3
|
|
|
|
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
|
from obitools3.dms import DMS
|
|
from obitools3.uri.decode import open_uri
|
|
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyOption
|
|
from obitools3.dms.view import RollbackException
|
|
from obitools3.apps.config import logger
|
|
from obitools3.dms.capi.obiview cimport COUNT_COLUMN
|
|
from obitools3.utils cimport tostr
|
|
|
|
from functools import reduce
|
|
import math
|
|
import time
|
|
import sys
|
|
from cpython.exc cimport PyErr_CheckSignals
|
|
|
|
|
|
__title__="Compute basic statistics for attribute values."
|
|
|
|
'''
|
|
`obi stats` computes basic statistics for attribute values of sequence records.
|
|
The sequence records can be categorized or not using one or several ``-c`` options.
|
|
By default, only the number of sequence records and the total count are computed for each category.
|
|
Additional statistics can be computed for attribute values in each category, such as:
|
|
|
|
- minimum value (``-m`` option)
|
|
- maximum value (``-M`` option)
|
|
- mean value (``-a`` option)
|
|
- variance (``-v`` option)
|
|
- standard deviation (``-s`` option)
|
|
|
|
The result is a contingency table with the different categories in rows, and the
|
|
computed statistics in columns.
|
|
'''
|
|
|
|
|
|
# TODO: when is the taxonomy possibly used?
|
|
|
|
|
|
def addOptions(parser):
|
|
|
|
addMinimalInputOption(parser)
|
|
addTaxonomyOption(parser)
|
|
|
|
group=parser.add_argument_group('obi stats specific options')
|
|
|
|
group.add_argument('-c','--category-attribute',
|
|
action="append", dest="stats:categories",
|
|
metavar="<Attribute Name>",
|
|
default=[],
|
|
help="Attribute used to categorize the records.")
|
|
|
|
group.add_argument('-m','--min',
|
|
action="append", dest="stats:minimum",
|
|
metavar="<Attribute Name>",
|
|
default=[],
|
|
help="Compute the minimum value of attribute for each category.")
|
|
|
|
group.add_argument('-M','--max',
|
|
action="append", dest="stats:maximum",
|
|
metavar="<Attribute Name>",
|
|
default=[],
|
|
help="Compute the maximum value of attribute for each category.")
|
|
|
|
group.add_argument('-a','--mean',
|
|
action="append", dest="stats:mean",
|
|
metavar="<Attribute Name>",
|
|
default=[],
|
|
help="Compute the mean value of attribute for each category.")
|
|
|
|
group.add_argument('-v','--variance',
|
|
action="append", dest="stats:var",
|
|
metavar="<Attribute Name>",
|
|
default=[],
|
|
help="Compute the variance of attribute for each category.")
|
|
|
|
group.add_argument('-s','--std-dev',
|
|
action="append", dest="stats:sd",
|
|
metavar="<Attribute Name>",
|
|
default=[],
|
|
help="Compute the standard deviation of attribute for each category.")
|
|
|
|
|
|
def statistics(values, attributes, func):
|
|
|
|
stat={}
|
|
lstat={}
|
|
|
|
for var in attributes:
|
|
if var in values:
|
|
stat[var]={}
|
|
lstat[var]=0
|
|
for c in values[var]:
|
|
v = values[var][c]
|
|
m = func(v)
|
|
stat[var][c]=m
|
|
lm=len(str(m))
|
|
if lm > lstat[var]:
|
|
lstat[var]=lm
|
|
|
|
return stat, lstat
|
|
|
|
|
|
def minimum(values, options):
|
|
return statistics(values, options['minimum'], min)
|
|
|
|
|
|
def maximum(values, options):
|
|
return statistics(values, options['maximum'], max)
|
|
|
|
|
|
def mean(values, options):
|
|
def average(v):
|
|
s = reduce(lambda x,y:x+y,v,0)
|
|
return float(s)/len(v)
|
|
return statistics(values, options['mean'], average)
|
|
|
|
|
|
def variance(v):
|
|
if len(v)==1:
|
|
return 0
|
|
s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.))
|
|
return s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1)
|
|
|
|
|
|
def varpop(values, options):
|
|
return statistics(values, options['var'], variance)
|
|
|
|
|
|
def sd(values, options):
|
|
def stddev(v):
|
|
return math.sqrt(variance(v))
|
|
return statistics(values, options['sd'], stddev)
|
|
|
|
|
|
def run(config):
|
|
|
|
DMS.obi_atexit()
|
|
|
|
logger("info", "obi stats")
|
|
|
|
# Open the input
|
|
input = open_uri(config['obi']['inputURI'])
|
|
if input is None:
|
|
raise Exception("Could not read input view")
|
|
i_view = input[1]
|
|
|
|
if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
|
|
taxo_uri = open_uri(config['obi']['taxoURI'])
|
|
if taxo_uri is None or taxo_uri[2] == bytes:
|
|
raise Exception("Couldn't open taxonomy")
|
|
taxo = taxo_uri[1]
|
|
else :
|
|
taxo = None
|
|
|
|
statistics = set(config['stats']['minimum']) | set(config['stats']['maximum']) | set(config['stats']['mean'])
|
|
total = 0
|
|
catcount={}
|
|
totcount={}
|
|
values={}
|
|
lcat=0
|
|
|
|
# Initialize the progress bar
|
|
pb = ProgressBar(len(i_view), config, seconde=5)
|
|
|
|
for i in range(len(i_view)):
|
|
PyErr_CheckSignals()
|
|
pb(i)
|
|
line = i_view[i]
|
|
|
|
category = []
|
|
for c in config['stats']['categories']:
|
|
try:
|
|
if taxo is not None:
|
|
loc_env = {'sequence': line, 'line': line, 'taxonomy': taxo}
|
|
else:
|
|
loc_env = {'sequence': line, 'line': line}
|
|
|
|
v = eval(c, loc_env, line)
|
|
|
|
lv=len(str(v))
|
|
if lv > lcat:
|
|
lcat=lv
|
|
category.append(v)
|
|
except:
|
|
category.append(None)
|
|
if 4 > lcat:
|
|
lcat=4
|
|
|
|
category=tuple(category)
|
|
catcount[category]=catcount.get(category,0)+1
|
|
try:
|
|
totcount[category]=totcount.get(category,0)+line[COUNT_COLUMN]
|
|
except KeyError:
|
|
totcount[category]=totcount.get(category,0)+1
|
|
for var in statistics:
|
|
if var in line:
|
|
v = line[var]
|
|
if var not in values:
|
|
values[var]={}
|
|
if category not in values[var]:
|
|
values[var][category]=[]
|
|
values[var][category].append(v)
|
|
|
|
pb(i, force=True)
|
|
print("", file=sys.stderr)
|
|
|
|
mini, lmini = minimum(values, config['stats'])
|
|
maxi, lmaxi = maximum(values, config['stats'])
|
|
avg, lavg = mean(values, config['stats'])
|
|
varp, lvarp = varpop(values, config['stats'])
|
|
sigma, lsigma = sd(values, config['stats'])
|
|
|
|
pcat = "%%-%ds" % lcat
|
|
if config['stats']['minimum']:
|
|
minvar= "min_%%-%ds" % max(len(x) for x in config['stats']['minimum'])
|
|
else:
|
|
minvar= "%s"
|
|
|
|
if config['stats']['maximum']:
|
|
maxvar= "max_%%-%ds" % max(len(x) for x in config['stats']['maximum'])
|
|
else:
|
|
maxvar= "%s"
|
|
|
|
if config['stats']['mean']:
|
|
meanvar= "mean_%%-%ds" % max(len(x) for x in config['stats']['mean'])
|
|
else:
|
|
meanvar= "%s"
|
|
|
|
if config['stats']['var']:
|
|
varvar= "var_%%-%ds" % max(len(x) for x in config['stats']['var'])
|
|
else:
|
|
varvar= "%s"
|
|
|
|
if config['stats']['sd']:
|
|
sdvar= "sd_%%-%ds" % max(len(x) for x in config['stats']['sd'])
|
|
else:
|
|
sdvar= "%s"
|
|
|
|
hcat = "\t".join([pcat % x for x in config['stats']['categories']]) + "\t" +\
|
|
"\t".join([minvar % x for x in config['stats']['minimum']]) + "\t" +\
|
|
"\t".join([maxvar % x for x in config['stats']['maximum']]) + "\t" +\
|
|
"\t".join([meanvar % x for x in config['stats']['mean']]) + "\t" +\
|
|
"\t".join([varvar % x for x in config['stats']['var']]) + "\t" +\
|
|
"\t".join([sdvar % x for x in config['stats']['sd']]) + \
|
|
"\t count" + \
|
|
"\t total"
|
|
print(hcat)
|
|
sorted_stats = sorted(catcount.items(), key = lambda kv:(totcount[kv[0]]), reverse=True)
|
|
for i in range(len(sorted_stats)):
|
|
c = sorted_stats[i][0]
|
|
for v in c:
|
|
if type(v) == bytes:
|
|
print(pcat % tostr(v)+"\t", end="")
|
|
else:
|
|
print(pcat % str(v)+"\t", end="")
|
|
for m in config['stats']['minimum']:
|
|
print((("%%%dd" % lmini[m]) % mini[m][c])+"\t", end="")
|
|
for m in config['stats']['maximum']:
|
|
print((("%%%dd" % lmaxi[m]) % maxi[m][c])+"\t", end="")
|
|
for m in config['stats']['mean']:
|
|
print((("%%%df" % lavg[m]) % avg[m][c])+"\t", end="")
|
|
for m in config['stats']['var']:
|
|
print((("%%%df" % lvarp[m]) % varp[m][c])+"\t", end="")
|
|
for m in config['stats']['sd']:
|
|
print((("%%%df" % lsigma[m]) % sigma[m][c])+"\t", end="")
|
|
print("%7d" %catcount[c], end="")
|
|
print("%9d" %totcount[c])
|
|
|
|
input[0].close(force=True)
|
|
|
|
logger("info", "Done.")
|