obi uniq: added taxonomy handling

2017-10-04 16:13:07 +02:00
parent 9fc6868341
commit 5ddd1d9ae6
2 changed files with 193 additions and 62 deletions
--- a/python/obitools3/commands/uniq.pxd
+++ b/python/obitools3/commands/uniq.pxd
@ -5,4 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
-cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
+cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy)
 cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
--- a/python/obitools3/commands/uniq.pyx
+++ b/python/obitools3/commands/uniq.pyx
@ -7,13 +7,11 @@ from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column, Column_line
 from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
 from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
-from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption, addTaxonomyInputOption
 from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
 # TODO silence non-implemented options
 __title__="Groups records together"
@ -22,7 +20,7 @@ def addOptions(parser):
    addSequenceInputOption(parser)
    addMinimalOutputOption(parser)
- 
+    addTaxonomyInputOption(parser)
    group = parser.add_argument_group('obi uniq specific options')
@ -46,17 +44,138 @@ def addOptions(parser):
                             "used to group sequences before dereplication "
                             "(option can be used several times).")
-    group.add_argument('--prefix', '-p',
+#     group.add_argument('--prefix', '-p',
-                        action="store_true", dest="uniq:prefix",
+#                         action="store_true", dest="uniq:prefix",
-                        default=False,
+#                         default=False,
-                        help="Dereplication is done based on prefix matching: "
+#                         help="Dereplication is done based on prefix matching: "
-                             "(i) The shortest sequence of each group is a prefix "
+#                              "(i) The shortest sequence of each group is a prefix "
-                             "of any sequence of its group (ii) Two shortest "
+#                              "of any sequence of its group (ii) Two shortest "
-                             "sequences of any couple of groups are not the"
+#                              "sequences of any couple of groups are not the"
-                             "prefix of the other one.")
+#                              "prefix of the other one.")
-cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
+cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
    cdef int    taxid
    cdef Line   seq
    cdef list   m_taxids
    cdef bytes  k
    cdef object tsp
    cdef object tgn
    cdef object tfa
    cdef object sp_sn
    cdef object gn_sn
    cdef object fa_sn
    # Create columns
    if 'species' in o_view and o_view['species'].data_type_int != OBI_INT :
        o_view.delete_column('species')
    if 'species' not in o_view:
        Column.new_column(o_view, 
                          'species', 
                          OBI_INT
                         )
    if 'genus' in o_view and o_view['genus'].data_type_int != OBI_INT :
        o_view.delete_column('genus')
    if 'genus' not in o_view:
        Column.new_column(o_view, 
                          'genus', 
                          OBI_INT
                         )
    if 'family' in o_view and o_view['family'].data_type_int != OBI_INT :
        o_view.delete_column('family')
    if 'family' not in o_view:
        Column.new_column(o_view, 
                          'family', 
                          OBI_INT
                         )
    if 'species_name' in o_view and o_view['species_name'].data_type_int != OBI_STR :
        o_view.delete_column('species_name')
    if 'species_name' not in o_view:
        Column.new_column(o_view, 
                          'species_name', 
                          OBI_STR
                         )
    if 'genus_name' in o_view and o_view['genus_name'].data_type_int != OBI_STR :
        o_view.delete_column('genus_name')
    if 'genus_name' not in o_view:
        Column.new_column(o_view, 
                          'genus_name', 
                          OBI_STR
                         )
    if 'family_name' in o_view and o_view['family_name'].data_type_int != OBI_STR :
        o_view.delete_column('family_name')
    if 'family_name' not in o_view:
        Column.new_column(o_view, 
                          'family_name', 
                          OBI_STR
                         )
    if 'rank' in o_view and o_view['rank'].data_type_int != OBI_STR :
        o_view.delete_column('rank')
    if 'rank' not in o_view:
        Column.new_column(o_view, 
                          'rank', 
                          OBI_STR
                         )
    if 'scientific_name' in o_view and o_view['scientific_name'].data_type_int != OBI_STR :
        o_view.delete_column('scientific_name')
    if 'scientific_name' not in o_view:
        Column.new_column(o_view, 
                          'scientific_name', 
                          OBI_STR
                         )
    for seq in o_view:
        if 'merged_taxid' in seq :
            m_taxids = []
            for k in seq['merged_taxid'].keys() :
                if seq['merged_taxid'][k] is not None:
                    m_taxids.append(int(k))
            taxid = taxonomy.last_common_taxon(*m_taxids)
            seq['taxid'] = taxid
            tsp = taxonomy.get_species(taxid)
            tgn = taxonomy.get_genus(taxid)
            tfa = taxonomy.get_family(taxid)
            if tsp is not None:
                sp_sn = taxonomy.get_scientific_name(tsp)
            else:
                sp_sn = None   # TODO was '###', discuss
                tsp = None     # TODO was '-1', discuss
            if tgn is not None:
                gn_sn = taxonomy.get_scientific_name(tgn)
            else:
                gn_sn = None
                tgn = None
            if tfa is not None:
                fa_sn = taxonomy.get_scientific_name(tfa)
            else:
                fa_sn = None
                tfa = None
            seq['species'] = tsp
            seq['genus'] = tgn
            seq['family'] = tfa
            seq['species_name'] = sp_sn
            seq['genus_name'] = gn_sn
            seq['family_name'] = fa_sn
            seq['rank'] = taxonomy.get_rank(taxid)
            seq['scientific_name'] = taxonomy.get_scientific_name(taxid)
 cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
    cdef int            i
    cdef int            o_idx
@ -80,14 +199,16 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
    cdef Column_line    i_mcol  
    cdef list           catl
    cdef dict           merged_ids_dict
-    
+    cdef int            nb_ids_max
-    cdef bytes k
+    cdef set            ids_list    
-    cdef int n
+    cdef bytes          k
    cdef int            n
    cdef dict           taxid_dist_dict
    #print(categories)
    uniques = {}
-    
+        
    if mergedKeys_list is not None:
        mergedKeys=set(mergedKeys_list)
    else:
@ -95,10 +216,10 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
    if taxonomy is not None:
        mergedKeys.add('taxid')
-    
+        
    if categories is None:
        categories = []
-    
+        
    # Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names
    logger("info", "First browsing through the input")
    merged_infos = {}
@ -132,11 +253,12 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
                          comments=i_col.comments,
                          alias=merged_col_name     # TODO what if it already exists
                         )
-        
+         
    del(merged_infos)
    logger("info", "Second browsing through the input")
    merged_ids_dict = {}
    taxid_dist_dict = {}
    i = 0
    o_idx = 0
    seq_col = view[NUC_SEQUENCE_COLUMN]
@ -167,19 +289,17 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
            o_seq = o_view[u_idx]                    
            o_seq[COUNT_COLUMN] += i_count
 #            if taxonomy is not None and 'taxid' in seq:
 #                s['merged_taxid'][seq['taxid']]=
            for key in mergedKeys:
-#                if key=='taxid' and mergeIds:   # TODO
+                if key=='taxid' and mergeIds:
-#                    if 'taxid_dist' in i_seq:
+                    if 'taxid_dist' in i_seq:
-#                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
+                        taxid_dist_dict[o_seq.id].update(i_seq["taxid_dist"])
-#                    if 'taxid' in i_seq:
+                    if 'taxid' in i_seq:
-#                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
+                        taxid_dist_dict[o_seq.id][i_seq.id] = i_seq['taxid']
                mkey = "merged_%s" % key
                #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
                if key in i_seq:
-                    to_merge = i_seq[key]
+                    to_merge = str(i_seq[key])
                    mcol = o_seq[mkey]
                    if mcol[to_merge] is None:
                        mcol[to_merge] = i_count
@ -214,35 +334,36 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
            if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
                o_seq[COUNT_COLUMN] = 1
-                        
+            
            for key in mergedKeys:
-#                 if key=='taxid' and mergeIds:
+                if key=='taxid' and mergeIds:
-#                     if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
+                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
-#                         u_seq["taxid_dist"] = {}
+                        taxid_dist_dict[i_seq.id] = {}
-#                     else :
+                    if 'taxid' in o_seq:
-#                         u_seq["taxid_dist"] = o_seq["taxid_dist"]
+                        taxid_dist_dict[i_seq.id][o_seq.id] = o_seq['taxid']
 #                     if 'taxid' in o_seq:
 #                         u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
                mkey = "merged_%s" % key
                if key in o_seq:
-                    to_merge = o_seq[key]
+                    to_merge = str(o_seq[key])
                    mcol = o_seq[mkey]
                    if to_merge in mcol and mcol[to_merge] is not None:
                        mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
                    else:
                        mcol[to_merge] = o_seq[COUNT_COLUMN]
-                    o_seq[key] = None   # TODO delete column eventually -> make C function?
+                    o_seq[key] = None   # TODO delete column eventually -> make C function that deletes 'empty' columns
            if mergeIds:
-                merged_ids_dict[o_seq.id] = [o_seq.id]  # TODO check that this id is added too in the original obiuniq
+                merged_ids_dict[o_seq.id] = [o_seq.id]
                #o_seq['merged']=[o_seq.id]
        i+=1 
    # Merged ids column
    if mergeIds :
        logger("info", "Building merged ids column")
        nb_ids_max = 0
        ids_list = set()
        for k in merged_ids_dict :
            ids_list = set(list(ids_list) + merged_ids_dict[k])
            n = len(merged_ids_dict[k])
            if n > nb_ids_max :
                nb_ids_max = n
@ -259,9 +380,24 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
        for o_seq in o_view:
            o_seq['merged'] = merged_ids_dict[o_seq.id]
-    #TODO
+    if mergeIds and 'taxid' in mergedKeys:
-    #if taxonomy is not None:
+        logger("info", "Building taxid dist column")
-    #    mergeTaxonomyClassification(uniqSeq, taxonomy)
+        Column.new_column(o_view, 
                          "taxid_dist", 
                          OBI_INT, 
                          nb_elements_per_line=len(ids_list),
                          elements_names=list(ids_list),
                          comments="obi uniq taxid dist",
                          alias="taxid_dist"     # TODO what if it already exists
                         )
        for o_seq in o_view:
            if o_seq.id in taxid_dist_dict :
                o_seq['taxid_dist'] = taxid_dist_dict[o_seq.id]
    if taxonomy is not None:
        logger("info", "Merging taxonomy classification")
        merge_taxonomy_classification(o_view, taxonomy)
@ -269,6 +405,8 @@ def run(config):
    cdef tuple         input
    cdef tuple         output 
    cdef tuple         taxo_uri
    cdef Taxonomy      taxo
    cdef View_NUC_SEQS entries
    cdef View_NUC_SEQS o_view
    cdef ProgressBar   pb
@ -288,7 +426,12 @@ def run(config):
    entries = input[1]
    o_view = output[1]
-    
+    if 'taxoURI' in config['obi'] :   # TODO default None problem
        taxo_uri = open_uri(config['obi']['taxoURI'])
        taxo = taxo_uri[1]
    else :
        taxo = None
    # Initialize the progress bar
    pb = ProgressBar(len(entries), config, seconde=5)
@ -296,26 +439,13 @@ def run(config):
 #    if options.prefix:
 #        usm = uniqPrefixSequence
 #    else:
-    usm = uniqSequence
+    usm = uniq_sequences
-    
+        
 #     if 'taxoURI' in config['obi'] :   # TODO default None problem
 #         taxo = open_uri(config['obi']['taxoURI'])
 #     else :
    taxo = None
    usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])       
 #     if 'merge' in config['uniq'] :
 #         merged_keys=set(config['uniq']['merge'])
 #     else:
 #         merged_keys=set() 
 #       
 #     if 'taxo' in config['obi'] :
 #         merged_keys.add('taxid')
    print("\n")
    print(repr(o_view))
-    
+
-    input[0].close()
+    input[0].close()
-    output[0].close()     
+    output[0].close()