New command: obi stats

2018-08-13 15:08:10 +02:00
parent 2dc7fcceac
commit b57e938cc4
1 changed files with 262 additions and 0 deletions
--- a/python/obitools3/commands/stats.pyx
+++ b/python/obitools3/commands/stats.pyx
@ -0,0 +1,262 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms import DMS
+from obitools3.uri.decode import open_uri
+from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyInputOption
+from obitools3.dms.view import RollbackException
+from obitools3.apps.config import logger
+from obitools3.dms.capi.obiview cimport COUNT_COLUMN
+
+from functools import reduce
+import math
+import time
+ 
+
+__title__="Compute basic statistics for attribute values."
+
+'''
+`obi stats` computes basic statistics for attribute values of sequence records.
+The sequence records can be categorized or not using one or several ``-c`` options.
+By default, only the number of sequence records and the total count are computed for each category. 
+Additional statistics can be computed for attribute values in each category, such as:
+
+    - minimum value (``-m`` option) 
+    - maximum value (``-M`` option) 
+    - mean value (``-a`` option) 
+    - variance (``-v`` option) 
+    - standard deviation (``-s`` option)
+    
+The result is a contingency table with the different categories in rows, and the 
+computed statistics in columns. 
+'''
+
+
+# TODO: when is the taxonomy possibly used?
+
+ 
+def addOptions(parser):
+    
+    addMinimalInputOption(parser)
+    addTaxonomyInputOption(parser)
+
+    group=parser.add_argument_group('obi stats specific options')
+
+    group.add_argument('-c','--category-attribute',
+                       action="append", dest="stats:categories",
+                             metavar="<Attribute Name>",
+                             default=[],
+                             help="Attribute used to categorize the records.")
+    
+    group.add_argument('-m','--min',
+                       action="append", dest="stats:minimum",
+                       metavar="<Attribute Name>",
+                       default=[],
+                       help="Compute the minimum value of attribute for each category.")
+    
+    group.add_argument('-M','--max',
+                       action="append", dest="stats:maximum",
+                       metavar="<Attribute Name>",
+                       default=[],
+                       help="Compute the maximum value of attribute for each category.")
+
+    group.add_argument('-a','--mean',
+                       action="append", dest="stats:mean",
+                       metavar="<Attribute Name>",
+                       default=[],
+                       help="Compute the mean value of attribute for each category.")
+
+    group.add_argument('-v','--variance',
+                       action="append", dest="stats:var",
+                       metavar="<Attribute Name>",
+                       default=[],
+                       help="Compute the variance of attribute for each category.")
+
+    group.add_argument('-s','--std-dev',
+                       action="append", dest="stats:sd",
+                       metavar="<Attribute Name>",
+                       default=[],
+                       help="Compute the standard deviation of attribute for each category.")
+
+
+def statistics(values, attributes, func):
+    
+    stat={}
+    lstat={}
+    
+    for var in attributes:
+        if var in values:
+            stat[var]={}
+            lstat[var]=0
+            for c in values[var]:
+                v = values[var][c]
+                m = func(v)
+                stat[var][c]=m
+                lm=len(str(m))
+                if lm > lstat[var]:
+                    lstat[var]=lm
+                
+    return stat, lstat
+
+
+def minimum(values, options):
+    return statistics(values, options['minimum'], min)
+    
+
+def maximum(values, options):
+    return statistics(values, options['maximum'], max)
+
+
+def mean(values, options):
+    def average(v):
+        s = reduce(lambda x,y:x+y,v,0)
+        return float(s)/len(v)
+    return statistics(values, options['mean'], average)
+
+
+def variance(v):
+    if len(v)==1: 
+        return 0 
+    s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.))
+    return s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1)
+
+
+def varpop(values, options):
+    return statistics(values, options['var'], variance)
+    
+
+def sd(values, options):
+    def stddev(v):
+        return math.sqrt(variance(v))
+    return statistics(values, options['sd'], stddev)
+
+ 
+def run(config):
+     
+    DMS.obi_atexit()
+    
+    logger("info", "obi stats")
+
+    # Open the input
+    input = open_uri(config['obi']['inputURI'])
+    if input is None:
+        raise Exception("Could not read input view")
+    i_view = input[1]
+
+    if 'taxoURI' in config['obi'] :   # TODO default None problem
+        taxo_uri = open_uri(config['obi']['taxoURI'])
+        if taxo_uri is None:
+            raise Exception("Couldn't open taxonomy")
+        taxo = taxo_uri[1]
+    else :
+        taxo = None
+
+    statistics = set(config['stats']['minimum']) | set(config['stats']['maximum']) | set(config['stats']['mean'])
+    total = 0
+    catcount={}
+    totcount={}
+    values={}
+    lcat=0
+    
+    # Initialize the progress bar
+    pb = ProgressBar(len(i_view), config, seconde=5)
+    
+    for i in range(len(i_view)):
+        pb(i)
+        line = i_view[i]
+        
+        category = []
+        for c in config['stats']['categories']:
+            try:
+                if taxo is not None:
+                    loc_env = {'sequence': line, 'line': line, 'taxonomy': taxo}
+                else:
+                    loc_env = {'sequence': line, 'line': line}
+
+                v = eval(c, loc_env, line)
+                
+                lv=len(str(v))
+                if lv > lcat:
+                    lcat=lv
+                category.append(v)
+            except:
+                category.append(None)
+                if 4 > lcat:
+                    lcat=4
+
+        category=tuple(category)
+        catcount[category]=catcount.get(category,0)+1
+        try: 
+            totcount[category]=totcount.get(category,0)+line[COUNT_COLUMN]
+        except KeyError:
+            totcount[category]=totcount.get(category,0)+1
+        for var in statistics:
+            if var in line:
+                v = line[var]
+                if var not in values:
+                    values[var]={}
+                if category not in values[var]:
+                    values[var][category]=[]
+                values[var][category].append(v)    
+
+    mini, lmini = minimum(values, config['stats'])
+    maxi, lmaxi = maximum(values, config['stats'])
+    avg, lavg = mean(values, config['stats'])
+    varp, lvarp = varpop(values, config['stats'])
+    sigma, lsigma = sd(values, config['stats'])
+
+    pcat = "%%-%ds" % lcat
+    if config['stats']['minimum']:
+        minvar= "min_%%-%ds" % max(len(x) for x in config['stats']['minimum'])
+    else:
+        minvar= "%s"
+        
+    if config['stats']['maximum']:
+        maxvar= "max_%%-%ds" % max(len(x) for x in config['stats']['maximum'])
+    else:
+        maxvar= "%s"
+        
+    if config['stats']['mean']:
+        meanvar= "mean_%%-%ds" % max(len(x) for x in config['stats']['mean'])
+    else:
+        meanvar= "%s"
+        
+    if config['stats']['var']:
+        varvar= "var_%%-%ds" % max(len(x) for x in config['stats']['var'])
+    else:
+        varvar= "%s"
+        
+    if config['stats']['sd']:
+        sdvar= "sd_%%-%ds" % max(len(x) for x in config['stats']['sd'])
+    else:
+        sdvar= "%s"
+        
+    hcat = "\t".join([pcat % x for x in config['stats']['categories']]) + "\t" +\
+           "\t".join([minvar % x for x in config['stats']['minimum']])  + "\t" +\
+           "\t".join([maxvar % x for x in config['stats']['maximum']])  + "\t" +\
+           "\t".join([meanvar % x for x in config['stats']['mean']])  + "\t" +\
+           "\t".join([varvar % x for x in config['stats']['var']])  + "\t" +\
+           "\t".join([sdvar % x for x in config['stats']['sd']]) + \
+           "\t   count" + \
+           "\t   total" 
+    print(hcat)
+    for c in catcount:
+        for v in c:
+            print(pcat % str(v)+"\t", end="")
+        for m in config['stats']['minimum']:
+            print((("%%%dd" % lmini[m]) % mini[m][c])+"\t", end="")
+        for m in config['stats']['maximum']:
+            print((("%%%dd" % lmaxi[m]) % maxi[m][c])+"\t", end="")
+        for m in config['stats']['mean']:
+            print((("%%%df" % lavg[m]) % avg[m][c])+"\t", end="")
+        for m in config['stats']['var']:
+            print((("%%%df" % lvarp[m]) % varp[m][c])+"\t", end="")
+        for m in config['stats']['sd']:
+            print((("%%%df" % lsigma[m]) % sigma[m][c])+"\t", end="")
+        print("%7d" %catcount[c], end="")
+        print("%9d" %totcount[c])
+
+    input[0].close()
+    
+    
+