obi uniq: added option to use categories in addition to the sequence to determine uniqueness
2017-09-25 10:56:43 +02:00
parent 5ed6835e0e
commit 75c15594c4
2 changed files with 37 additions and 20 deletions
--- a/python/obitools3/commands/uniq.pxd
+++ b/python/obitools3/commands/uniq.pxd
@ -5,4 +5,4 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
-cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
--- a/python/obitools3/commands/uniq.pyx
+++ b/python/obitools3/commands/uniq.pyx
@ -2,7 +2,6 @@
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.dms.dms cimport DMS
 from obitools3.dms.taxo.taxo cimport Taxonomy
 from obitools3.dms.view.view cimport View, Line
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column, Column_line
@ -59,12 +58,12 @@ def addOptions(parser):
 # TODO taxonomy
-cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
+cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
    cdef int            i
    cdef int            o_idx
    cdef int            u_idx
-    cdef int            u_id
+    cdef tuple          u_id
    cdef int            i_count
    cdef set            mergedKeys
    cdef dict           uniques
@ -80,23 +79,26 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxo
    cdef Column         seq_col
    cdef object         to_merge
    cdef Column_line    mcol
-    cdef Column_line    i_mcol
+    cdef Column_line    i_mcol  
-        
+    cdef list           catl
    #print(categories)
    uniques = {}
    if categories is None:
        categories=[]
    if mergedKeys_list is not None:
        mergedKeys=set(mergedKeys_list)
    else:
        mergedKeys=set() 
-#    if taxonomy is not None:
+    if taxonomy is not None:
-#        mergedKeys.add('taxid')
+        mergedKeys.add('taxid')
    if categories is None:
        categories = []
    # Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names
-    #logger("info", "obi uniq", "First browsing through the input")
+    logger("info", "First browsing through the input")
    merged_infos = {}
    i = 0
    iter_view = iter(view)
@ -131,20 +133,28 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxo
    del(merged_infos)
-    #logger("info", "obi uniq", "Second browsing through the input")
+    logger("info", "Second browsing through the input")
    i = 0
    o_idx = 0
    seq_col = view[NUC_SEQUENCE_COLUMN]
-        
+    
    iter_view = iter(view)
    for i_seq in iter_view :
        pb(i)
-        #u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
+        # This can't be done in the same line as the u_id tuple creation because it generates a bug
-        u_id = seq_col.get_line_idx(i)
+        # where Cython (version 0.25.2) does not detect the reference to the categs_list variable and deallocates 
        # it at the beginning of the function.
        # (Only happens if categs_list is an optional parameter, which it is).
        catl = []
        for x in categories :
            catl.append(i_seq[x])    
        u_id = tuple(catl) + (seq_col.get_line_idx(i),)
        #u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)  # The line that cython can't read properly
        if u_id in uniques:
-                        
+            
            if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
                i_count = 1
            else:
@ -248,9 +258,11 @@ def run(config):
                      input=False,
                      newviewtype=View_NUC_SEQS)
    # TODO exceptions not handled like they should be
    entries = input[1]
    o_view = output[1]
-         
+    
    # Initialize the progress bar
    pb = ProgressBar(len(entries), config, seconde=5)
@ -259,9 +271,14 @@ def run(config):
 #        usm = uniqPrefixSequence
 #    else:
    usm = uniqSequence
    usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])       
 #     if 'taxoURI' in config['obi'] :   # TODO default None problem
 #         taxo = open_uri(config['obi']['taxoURI'])
 #     else :
    taxo = None
    usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])       
 #     if 'merge' in config['uniq'] :
 #         merged_keys=set(config['uniq']['merge'])
 #     else:
`@ -5,4 +5,4 @@ from obitools3.dms.taxo.taxo cimport Taxonomy`
	`from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS`	`from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS`


	`cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)`	`cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)`