obi uniq: added the option to merge ids; for now it only works on small sets, until lists are properly implemented using obiblobs
2017-09-25 17:28:03 +02:00
parent 75c15594c4
commit ae24a807da
1 changed files with 36 additions and 10 deletions
--- a/python/obitools3/commands/uniq.pyx
+++ b/python/obitools3/commands/uniq.pyx
@ -6,7 +6,7 @@ from obitools3.dms.view.view cimport View, Line
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column, Column_line
 from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
-from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
+from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
 from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
 from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
@ -36,7 +36,7 @@ def addOptions(parser):
    group.add_argument('--merge-ids', '-e',
                       action="store_true", dest="uniq:mergeids",
                       default=False,
-                       help="Add the merged key with all ids of merged sequences.")
+                       help="ONLY WORKING ON SMALL SETS FOR NOW Add the merged key with all ids of merged sequences.")
   
    group.add_argument('--category-attribute', '-c',
                        action="append", dest="uniq:categories",
@ -55,8 +55,6 @@ def addOptions(parser):
                             "sequences of any couple of groups are not the"
                             "prefix of the other one.")

-# TODO taxonomy
-

 cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
     
@ -81,7 +79,11 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
    cdef Column_line    mcol
    cdef Column_line    i_mcol  
    cdef list           catl
+    cdef dict           merged_ids_dict
    
+    cdef bytes k
+    cdef int n
+
    #print(categories)
    
    uniques = {}
@ -130,10 +132,11 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
                          comments=i_col.comments,
                          alias=merged_col_name     # TODO what if it already exists
                         )
-    
+        
    del(merged_infos)
    
    logger("info", "Second browsing through the input")
+    merged_ids_dict = {}
    i = 0
    o_idx = 0
    seq_col = view[NUC_SEQUENCE_COLUMN]
@ -199,8 +202,9 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
                if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
                    o_seq[key_b] = None
                                        
- #           if mergeIds:                           # TODO
- #               u_seq['merged'].append(i_seq.id)
+            if mergeIds :
+                merged_ids_dict[o_seq.id].append(i_seq.id)
+                #o_seq['merged'].append(i_seq.id)
            
        else:
            o_view[o_idx] = i_seq
@ -229,10 +233,32 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
                        mcol[to_merge] = o_seq[COUNT_COLUMN]
                    o_seq[key] = None   # TODO delete column eventually -> make C function?
                        
-#            if mergeIds:
-#                u_seq['merged']=[o_seq.id]
+            if mergeIds:
+                merged_ids_dict[o_seq.id] = [o_seq.id]  # TODO check that this id is added too in the original obiuniq
+                #o_seq['merged']=[o_seq.id]
+            
        i+=1 
-   
+    
+    # Merged ids column
+    if mergeIds :
+        nb_ids_max = 0
+        for k in merged_ids_dict :
+            n = len(merged_ids_dict[k])
+            if n > nb_ids_max :
+                nb_ids_max = n
+                
+        Column.new_column(o_view,
+                          "merged",
+                          OBI_STR,
+                          nb_elements_per_line=nb_ids_max,
+                          elements_names=None,
+                          comments="obi uniq merged ids",
+                          alias="merged"     # TODO what if it already exists
+                         )
+        
+        for o_seq in o_view:
+            o_seq['merged'] = merged_ids_dict[o_seq.id]
+
    #TODO
    #if taxonomy is not None:
    #    mergeTaxonomyClassification(uniqSeq, taxonomy)