From b2fc1f461196800a210357cf7774ecb434a6ff7d Mon Sep 17 00:00:00 2001
From: Celine Mercier <celine.mercier@metabarcoding.org>
Date: Thu, 27 Jul 2017 19:40:19 +0200
Subject: [PATCH] obi uniq: first version

---
 python/obitools3/commands/uniq.pyx | 297 +++++++++++++++++++++++++++++
 1 file changed, 297 insertions(+)
 create mode 100644 python/obitools3/commands/uniq.pyx
diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx
new file mode 100644
index 0000000..82d1884
--- /dev/null
+++ b/python/obitools3/commands/uniq.pyx
@@ -0,0 +1,297 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms.dms import DMS       # TODO cimport doesn't work
+from obitools3.dms.view.view import View    # TODO cimport doesn't work
+from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
+from obitools3.dms.obiseq cimport Nuc_Seq
+from obitools3.dms.column.column cimport Column
+from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
+from obitools3.dms.capi.obitypes cimport OBI_INT
+from obitools3.utils cimport tostr
+
+
+# TODO silence non-implemented options
+
+__title__="Groups records together"
+ 
+default_config = {   'inputview'    : None,
+                     'outputview'   : None
+                 }
+ 
+def addOptions(parser):
+ 
+    # TODO put this common group somewhere else but I don't know where
+    group=parser.add_argument_group('DMS and view options')
+ 
+    group.add_argument('--default-dms','-d', 
+                       action="store", dest="obi:defaultdms",
+                       metavar='<DMS NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the default DMS for reading and writing data.")
+ 
+    group.add_argument('--input-view','-i',
+                       action="store", dest="obi:inputview",
+                       metavar='<INPUT VIEW NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the input view, either raw if the view is in the default DMS,"
+                            " or in the form 'dms:view' if it is in another DMS.")
+ 
+    group.add_argument('--output-view','-o',
+                       action="store", dest="obi:outputview",
+                       metavar='<OUTPUT VIEW NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the output view, either raw if the view is in the default DMS,"
+                            " or in the form 'dms:view' if it is in another DMS.")
+    
+    group.add_argument('--taxo','-t',
+                       action="store", dest="obi:taxo",
+                       metavar='<TAXONOMY NAME>',
+                       default='',  # TODO not None because if it's None, the option is not entered in the option dictionary.
+                       type=str,
+                       help="Name of the taxonomy to use.")
+
+ 
+    group = parser.add_argument_group('obi uniq specific options')
+ 
+    group.add_argument('--merge', '-m',
+                       action="append", dest="uniq:merge",
+                       metavar="<TAG NAME>",
+                       default=[],
+                       type=str,
+                       help="Attributes to merge.")     # TODO must be a 1 elt/line column
+
+    group.add_argument('--merge-ids', '-e',
+                       action="store_true", dest="uniq:mergeids",
+                       default=False,
+                       help="Add the merged key with all ids of merged sequences.")
+   
+    group.add_argument('--category-attribute', '-c',
+                        action="append", dest="uniq:categories",
+                        metavar="<Attribute Name>",
+                        default=[],
+                        help="Add one attribute to the list of attributes "
+                             "used to group sequences before dereplication "
+                             "(option can be used several times).")
+
+    group.add_argument('--prefix', '-p',
+                        action="store_true", dest="uniq:prefix",
+                        default=False,
+                        help="Dereplication is done based on prefix matching: "
+                             "(i) The shortest sequence of each group is a prefix "
+                             "of any sequence of its group (ii) Two shortest "
+                             "sequences of any couple of groups are not the"
+                             "prefix of the other one.")
+
+# TODO taxonomy
+
+
+# TODO
+COUNT_COLUMN_str = tostr(COUNT_COLUMN)
+
+
+def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
+     
+    uniques = {}
+     
+    if categories is None:
+        categories=[]
+    
+    if mergedKey is not None:
+        mergedKey=set(mergedKey)
+    else:
+        mergedKey=set() 
+    
+    if taxonomy is not None:
+        mergedKey.add('taxid')
+    
+    # Faire parcours de la view des colonnes à merged pour créer les merged_col avant et les remplir au fur et à mesure
+    
+    o_idx = 0
+    i = 0
+    seq_col = view[NUC_SEQUENCE_COLUMN]
+
+    iter_view = iter(view)
+    for i_seq in iter_view :
+        pass
+    
+    iter_view = iter(view)
+    for i_seq in iter_view :
+        pb(i)
+        # utiliser l'index des AVLs, faire l'API
+        #u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
+        u_id = seq_col.get_line_idx(i)
+        if u_id in uniques:
+            u_seq = uniques[u_id]
+            o_seq = o_view[u_seq['idx']]
+            if COUNT_COLUMN_str in i_seq:
+                o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
+            else:
+                o_seq[COUNT_COLUMN_str] += 1
+#                seq['COUNT']=1
+#            if taxonomy is not None and 'taxid' in seq:
+#                s['merged_taxid'][seq['taxid']]=
+            for key in mergedKey:
+                if key=='taxid' and mergeIds:   # TODO
+                    if 'taxid_dist' in i_seq:
+                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
+                    if 'taxid' in i_seq:
+                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
+                        
+                mkey = "merged_%s" % key 
+                #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
+                if key in i_seq:
+                    u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
+                #cas ou merged_keys existe deja
+                else:
+                    if mkey in i_seq:
+                        for skey in i_seq[mkey]:
+                            u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
+                            
+            for key in i_seq.keys():
+                # Merger proprement l'attribut merged s'il existe
+                if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str : #and key[0:7]!='merged_' and key!='merged': TODO check this
+                    o_seq[key] = None
+                            
+            if mergeIds:        
+                u_seq['merged'].append(i_seq.id)
+        
+        else:
+            
+            o_view[o_idx] = i_seq
+            o_seq = o_view[o_idx]
+            uniques[u_id] = {'idx':o_idx}
+            u_seq = uniques[u_id]
+            o_idx += 1
+
+            if COUNT_COLUMN_str not in o_seq:
+                o_seq[COUNT_COLUMN_str] = 1
+            
+            for key in mergedKey:
+                if key=='taxid' and mergeIds:
+                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
+                        u_seq["taxid_dist"] = {}
+                    else :
+                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
+                    if 'taxid' in o_seq:
+                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
+                mkey = "merged_%s" % key 
+                if mkey not in o_seq:
+                    u_seq[mkey]={}
+                else :
+                    u_seq[mkey] = o_seq[mkey]
+                if key in o_seq:
+                    u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
+                    o_seq[key] = None
+
+            if mergeIds:
+                u_seq['merged']=[o_seq.id]
+         
+        i+=1 
+                
+    #TODO
+    #if taxonomy is not None:
+    #    mergeTaxonomyClassification(uniqSeq, taxonomy)
+
+    # Get informations to build the columns with merged attributes
+    merged_infos = {}
+    for u_id in uniques :
+        u_seq = uniques[u_id]
+        for mkey in u_seq :
+            if mkey != 'idx' :
+                mkey_dict = u_seq[mkey]
+                if mkey not in merged_infos :
+                    merged_infos[mkey] = {}
+                    mkey_infos = merged_infos[mkey]
+                    mkey_infos['nb_elts'] = len(mkey_dict.keys())
+                    mkey_infos['elt_names'] = [k for k in mkey_dict]
+                else :
+                    mkey_infos = merged_infos[mkey]
+                    for k in mkey_dict :
+                        if k not in mkey_infos['elt_names'] :
+                            mkey_infos['elt_names'].append(k)
+                            mkey_infos['nb_elts'] += 1
+    
+    keys_to_del = []
+    for k in merged_infos :
+        if merged_infos[k]['nb_elts'] == 0:
+            keys_to_del.append(k)
+    for k in keys_to_del :
+        del merged_infos[k]
+    
+    return (uniques, merged_infos)
+
+     
+def run(config):
+    
+    # TODO declare variables 
+    
+    # Open DMS
+    d = DMS.open(config['obi']['defaultdms'])
+     
+    # Open input view
+    entries = View.open(d, config['obi']['inputview'])
+     
+    # Initialize the progress bar
+    pb = ProgressBar(len(entries), config, seconde=5)
+
+# TODO
+#    if options.prefix:
+#        usm = uniqPrefixSequence
+#    else:
+    usm = uniqSequence
+    
+    # Create output view
+    view_class = View.get_view_class(entries.type)
+    if view_class == View_NUC_SEQS :
+        get_quality = tostr(QUALITY_COLUMN) in entries # TODO
+        o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
+    else :
+        o_view = view_class.new(d, config['obi']['outputview'])
+
+    (uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])       
+    
+    if 'merge' in config['uniq'] :
+        merged_keys=set(config['uniq']['merge'])
+    else:
+        merged_keys=set() 
+      
+    if 'taxo' in config['obi'] :
+        merged_keys.add('taxid')
+    
+    # TODO gotta handle special merged columns
+    for k in merged_keys:
+        merged_col_name = "merged_%s" % k
+        if merged_col_name in merged_infos :
+            i_col = entries[k]
+            Column.new_column(o_view, 
+                              merged_col_name, 
+                              OBI_INT, 
+                              nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
+                              elements_names=merged_infos[merged_col_name]['elt_names'],
+                              comments=i_col.comments,
+                              alias=merged_col_name
+                             )
+            
+    for u_id in uniques:
+        u_dict = uniques[u_id]
+        for merged_k in u_dict :
+            if merged_k in merged_infos :   # TODO don't enter irrelevant keys to begin with, instead
+                o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
+             
+    print("\n")
+    print(repr(o_view))
+ 
+    d.close()
+     
+     
+     
+ 
+     
+     
+     
+     
+         
\ No newline at end of file