obi uniq: added taxonomy handling

This commit is contained in:
Celine Mercier
2017-10-04 16:13:07 +02:00
parent 9fc6868341
commit 5ddd1d9ae6
2 changed files with 193 additions and 62 deletions

View File

@ -5,4 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)

View File

@ -7,13 +7,11 @@ from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption, addTaxonomyInputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
# TODO silence non-implemented options
__title__="Groups records together"
@ -22,7 +20,7 @@ def addOptions(parser):
addSequenceInputOption(parser)
addMinimalOutputOption(parser)
addTaxonomyInputOption(parser)
group = parser.add_argument_group('obi uniq specific options')
@ -46,17 +44,138 @@ def addOptions(parser):
"used to group sequences before dereplication "
"(option can be used several times).")
group.add_argument('--prefix', '-p',
action="store_true", dest="uniq:prefix",
default=False,
help="Dereplication is done based on prefix matching: "
"(i) The shortest sequence of each group is a prefix "
"of any sequence of its group (ii) Two shortest "
"sequences of any couple of groups are not the"
"prefix of the other one.")
# group.add_argument('--prefix', '-p',
# action="store_true", dest="uniq:prefix",
# default=False,
# help="Dereplication is done based on prefix matching: "
# "(i) The shortest sequence of each group is a prefix "
# "of any sequence of its group (ii) Two shortest "
# "sequences of any couple of groups are not the"
# "prefix of the other one.")
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
    """
    Annotate every dereplicated sequence of *o_view* with taxonomy information.

    For each sequence that carries a 'merged_taxid' dictionary, the lowest
    common ancestor of all merged taxids is computed and written to 'taxid',
    and the species/genus/family ids and names, rank, and scientific name of
    that ancestor are written to dedicated columns (created on the fly if
    missing or of the wrong type).

    :param o_view:   output view whose sequences are annotated in place.
    :param taxonomy: taxonomy used to resolve taxids.
    """
    cdef int taxid
    cdef Line seq
    cdef list m_taxids
    cdef bytes k
    cdef object tsp
    cdef object tgn
    cdef object tfa
    cdef object sp_sn
    cdef object gn_sn
    cdef object fa_sn

    # Ensure every annotation column exists with the expected data type.
    # A pre-existing column of the wrong type is deleted and recreated;
    # this replaces eight copy-pasted create-column stanzas.
    for col_name, col_type in (('species',         OBI_INT),
                               ('genus',           OBI_INT),
                               ('family',          OBI_INT),
                               ('species_name',    OBI_STR),
                               ('genus_name',      OBI_STR),
                               ('family_name',     OBI_STR),
                               ('rank',            OBI_STR),
                               ('scientific_name', OBI_STR)):
        if col_name in o_view and o_view[col_name].data_type_int != col_type :
            o_view.delete_column(col_name)
        if col_name not in o_view:
            Column.new_column(o_view,
                              col_name,
                              col_type)

    for seq in o_view:
        if 'merged_taxid' in seq :
            # Collect the taxids actually observed in this merged group
            # (keys are stored as bytes; None counts mean "absent").
            m_taxids = []
            for k in seq['merged_taxid'].keys() :
                if seq['merged_taxid'][k] is not None:
                    m_taxids.append(int(k))
            # The group is assigned the last common taxon of all its taxids.
            taxid = taxonomy.last_common_taxon(*m_taxids)
            seq['taxid'] = taxid
            tsp = taxonomy.get_species(taxid)
            tgn = taxonomy.get_genus(taxid)
            tfa = taxonomy.get_family(taxid)
            if tsp is not None:
                sp_sn = taxonomy.get_scientific_name(tsp)
            else:
                sp_sn = None    # TODO was '###' (and taxid was '-1'), discuss
            if tgn is not None:
                gn_sn = taxonomy.get_scientific_name(tgn)
            else:
                gn_sn = None
            if tfa is not None:
                fa_sn = taxonomy.get_scientific_name(tfa)
            else:
                fa_sn = None
            seq['species'] = tsp
            seq['genus'] = tgn
            seq['family'] = tfa
            seq['species_name'] = sp_sn
            seq['genus_name'] = gn_sn
            seq['family_name'] = fa_sn
            seq['rank'] = taxonomy.get_rank(taxid)
            seq['scientific_name'] = taxonomy.get_scientific_name(taxid)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
cdef int i
cdef int o_idx
@ -80,9 +199,11 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
cdef Column_line i_mcol
cdef list catl
cdef dict merged_ids_dict
cdef int nb_ids_max
cdef set ids_list
cdef bytes k
cdef int n
cdef dict taxid_dist_dict
#print(categories)
@ -137,6 +258,7 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
logger("info", "Second browsing through the input")
merged_ids_dict = {}
taxid_dist_dict = {}
i = 0
o_idx = 0
seq_col = view[NUC_SEQUENCE_COLUMN]
@ -167,19 +289,17 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
o_seq = o_view[u_idx]
o_seq[COUNT_COLUMN] += i_count
# if taxonomy is not None and 'taxid' in seq:
# s['merged_taxid'][seq['taxid']]=
for key in mergedKeys:
# if key=='taxid' and mergeIds: # TODO
# if 'taxid_dist' in i_seq:
# u_seq["taxid_dist"].update(i_seq["taxid_dist"])
# if 'taxid' in i_seq:
# u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
if key=='taxid' and mergeIds:
if 'taxid_dist' in i_seq:
taxid_dist_dict[o_seq.id].update(i_seq["taxid_dist"])
if 'taxid' in i_seq:
taxid_dist_dict[o_seq.id][i_seq.id] = i_seq['taxid']
mkey = "merged_%s" % key
# Case where the merged keys are updated but the incoming sequence has no merged keys itself
if key in i_seq:
to_merge = i_seq[key]
to_merge = str(i_seq[key])
mcol = o_seq[mkey]
if mcol[to_merge] is None:
mcol[to_merge] = i_count
@ -216,33 +336,34 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
o_seq[COUNT_COLUMN] = 1
for key in mergedKeys:
# if key=='taxid' and mergeIds:
# if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
# u_seq["taxid_dist"] = {}
# else :
# u_seq["taxid_dist"] = o_seq["taxid_dist"]
# if 'taxid' in o_seq:
# u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
if key=='taxid' and mergeIds:
if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
taxid_dist_dict[i_seq.id] = {}
if 'taxid' in o_seq:
taxid_dist_dict[i_seq.id][o_seq.id] = o_seq['taxid']
mkey = "merged_%s" % key
if key in o_seq:
to_merge = o_seq[key]
to_merge = str(o_seq[key])
mcol = o_seq[mkey]
if to_merge in mcol and mcol[to_merge] is not None:
mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
else:
mcol[to_merge] = o_seq[COUNT_COLUMN]
o_seq[key] = None # TODO delete column eventually -> make C function?
o_seq[key] = None # TODO delete column eventually -> make C function that deletes 'empty' columns
if mergeIds:
merged_ids_dict[o_seq.id] = [o_seq.id] # TODO check that this id is added too in the original obiuniq
merged_ids_dict[o_seq.id] = [o_seq.id]
#o_seq['merged']=[o_seq.id]
i+=1
# Merged ids column
if mergeIds :
logger("info", "Building merged ids column")
nb_ids_max = 0
ids_list = set()
for k in merged_ids_dict :
ids_list = set(list(ids_list) + merged_ids_dict[k])
n = len(merged_ids_dict[k])
if n > nb_ids_max :
nb_ids_max = n
@ -259,9 +380,24 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
for o_seq in o_view:
o_seq['merged'] = merged_ids_dict[o_seq.id]
#TODO
#if taxonomy is not None:
# mergeTaxonomyClassification(uniqSeq, taxonomy)
if mergeIds and 'taxid' in mergedKeys:
logger("info", "Building taxid dist column")
Column.new_column(o_view,
"taxid_dist",
OBI_INT,
nb_elements_per_line=len(ids_list),
elements_names=list(ids_list),
comments="obi uniq taxid dist",
alias="taxid_dist" # TODO what if it already exists
)
for o_seq in o_view:
if o_seq.id in taxid_dist_dict :
o_seq['taxid_dist'] = taxid_dist_dict[o_seq.id]
if taxonomy is not None:
logger("info", "Merging taxonomy classification")
merge_taxonomy_classification(o_view, taxonomy)
@ -269,6 +405,8 @@ def run(config):
cdef tuple input
cdef tuple output
cdef tuple taxo_uri
cdef Taxonomy taxo
cdef View_NUC_SEQS entries
cdef View_NUC_SEQS o_view
cdef ProgressBar pb
@ -288,6 +426,11 @@ def run(config):
entries = input[1]
o_view = output[1]
if 'taxoURI' in config['obi'] : # TODO default None problem
taxo_uri = open_uri(config['obi']['taxoURI'])
taxo = taxo_uri[1]
else :
taxo = None
# Initialize the progress bar
pb = ProgressBar(len(entries), config, seconde=5)
@ -296,23 +439,10 @@ def run(config):
# if options.prefix:
# usm = uniqPrefixSequence
# else:
usm = uniqSequence
# if 'taxoURI' in config['obi'] : # TODO default None problem
# taxo = open_uri(config['obi']['taxoURI'])
# else :
taxo = None
usm = uniq_sequences
usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
# if 'merge' in config['uniq'] :
# merged_keys=set(config['uniq']['merge'])
# else:
# merged_keys=set()
#
# if 'taxo' in config['obi'] :
# merged_keys.add('taxid')
print("\n")
print(repr(o_view))