From 5ddd1d9ae6d10da1eaf05f1546883048905c32ae Mon Sep 17 00:00:00 2001
From: Celine Mercier <celine.mercier@metabarcoding.org>
Date: Wed, 4 Oct 2017 16:13:07 +0200
Subject: [PATCH] obi uniq: added taxonomy handling

---
 python/obitools3/commands/uniq.pxd |   3 +-
 python/obitools3/commands/uniq.pyx | 252 ++++++++++++++++++++++-------
 2 files changed, 193 insertions(+), 62 deletions(-)

diff --git a/python/obitools3/commands/uniq.pxd b/python/obitools3/commands/uniq.pxd
index 7402339..db21652 100644
--- a/python/obitools3/commands/uniq.pxd
+++ b/python/obitools3/commands/uniq.pxd
@@ -5,4 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 
 
-cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
+cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy)
+cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx
index 907367e..189e34c 100644
--- a/python/obitools3/commands/uniq.pyx
+++ b/python/obitools3/commands/uniq.pyx
@@ -7,13 +7,11 @@ from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column, Column_line
 from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
 from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
-from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption, addTaxonomyInputOption
 from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
 
 
-# TODO silence non-implemented options
-
 __title__="Groups records together"
  
 
@@ -22,7 +20,7 @@ def addOptions(parser):
  
     addSequenceInputOption(parser)
     addMinimalOutputOption(parser)
- 
+    addTaxonomyInputOption(parser)
  
     group = parser.add_argument_group('obi uniq specific options')
  
@@ -46,17 +44,138 @@ def addOptions(parser):
                              "used to group sequences before dereplication "
                              "(option can be used several times).")
 
-    group.add_argument('--prefix', '-p',
-                        action="store_true", dest="uniq:prefix",
-                        default=False,
-                        help="Dereplication is done based on prefix matching: "
-                             "(i) The shortest sequence of each group is a prefix "
-                             "of any sequence of its group (ii) Two shortest "
-                             "sequences of any couple of groups are not the"
-                             "prefix of the other one.")
+#     group.add_argument('--prefix', '-p',
+#                         action="store_true", dest="uniq:prefix",
+#                         default=False,
+#                         help="Dereplication is done based on prefix matching: "
+#                              "(i) The shortest sequence of each group is a prefix "
+#                              "of any sequence of its group (ii) Two shortest "
+#                              "sequences of any couple of groups are not the"
+#                              "prefix of the other one.")
 
 
-cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
+cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
+    
+    cdef int    taxid
+    cdef Line   seq
+    cdef list   m_taxids
+    cdef bytes  k
+    cdef object tsp
+    cdef object tgn
+    cdef object tfa
+    cdef object sp_sn
+    cdef object gn_sn
+    cdef object fa_sn
+
+    # Create columns
+    if 'species' in o_view and o_view['species'].data_type_int != OBI_INT :
+        o_view.delete_column('species')
+    if 'species' not in o_view:
+        Column.new_column(o_view, 
+                          'species', 
+                          OBI_INT
+                         )
+
+    if 'genus' in o_view and o_view['genus'].data_type_int != OBI_INT :
+        o_view.delete_column('genus')
+    if 'genus' not in o_view:
+        Column.new_column(o_view, 
+                          'genus', 
+                          OBI_INT
+                         )
+
+    if 'family' in o_view and o_view['family'].data_type_int != OBI_INT :
+        o_view.delete_column('family')
+    if 'family' not in o_view:
+        Column.new_column(o_view, 
+                          'family', 
+                          OBI_INT
+                         )
+
+    if 'species_name' in o_view and o_view['species_name'].data_type_int != OBI_STR :
+        o_view.delete_column('species_name')
+    if 'species_name' not in o_view:
+        Column.new_column(o_view, 
+                          'species_name', 
+                          OBI_STR
+                         )
+
+    if 'genus_name' in o_view and o_view['genus_name'].data_type_int != OBI_STR :
+        o_view.delete_column('genus_name')
+    if 'genus_name' not in o_view:
+        Column.new_column(o_view, 
+                          'genus_name', 
+                          OBI_STR
+                         )
+
+    if 'family_name' in o_view and o_view['family_name'].data_type_int != OBI_STR :
+        o_view.delete_column('family_name')
+    if 'family_name' not in o_view:
+        Column.new_column(o_view, 
+                          'family_name', 
+                          OBI_STR
+                         )
+
+    if 'rank' in o_view and o_view['rank'].data_type_int != OBI_STR :
+        o_view.delete_column('rank')
+    if 'rank' not in o_view:
+        Column.new_column(o_view, 
+                          'rank', 
+                          OBI_STR
+                         )
+
+    if 'scientific_name' in o_view and o_view['scientific_name'].data_type_int != OBI_STR :
+        o_view.delete_column('scientific_name')
+    if 'scientific_name' not in o_view:
+        Column.new_column(o_view, 
+                          'scientific_name', 
+                          OBI_STR
+                         )
+
+    for seq in o_view:
+        
+        if 'merged_taxid' in seq :
+            m_taxids = []
+            for k in seq['merged_taxid'].keys() :
+                if seq['merged_taxid'][k] is not None:
+                    m_taxids.append(int(k))
+            taxid = taxonomy.last_common_taxon(*m_taxids)
+            seq['taxid'] = taxid
+            tsp = taxonomy.get_species(taxid)
+            tgn = taxonomy.get_genus(taxid)
+            tfa = taxonomy.get_family(taxid)
+            
+            if tsp is not None:
+                sp_sn = taxonomy.get_scientific_name(tsp)
+            else:
+                sp_sn = None   # TODO was '###', discuss
+                tsp = None     # TODO was '-1', discuss
+                
+            if tgn is not None:
+                gn_sn = taxonomy.get_scientific_name(tgn)
+            else:
+                gn_sn = None
+                tgn = None
+                
+            if tfa is not None:
+                fa_sn = taxonomy.get_scientific_name(tfa)
+            else:
+                fa_sn = None
+                tfa = None
+                
+            seq['species'] = tsp
+            seq['genus'] = tgn
+            seq['family'] = tfa
+                
+            seq['species_name'] = sp_sn
+            seq['genus_name'] = gn_sn
+            seq['family_name'] = fa_sn
+            
+            seq['rank'] = taxonomy.get_rank(taxid)
+            seq['scientific_name'] = taxonomy.get_scientific_name(taxid)
+
+
+cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
      
     cdef int            i
     cdef int            o_idx
@@ -80,14 +199,16 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
     cdef Column_line    i_mcol  
     cdef list           catl
     cdef dict           merged_ids_dict
-    
-    cdef bytes k
-    cdef int n
+    cdef int            nb_ids_max
+    cdef set            ids_list    
+    cdef bytes          k
+    cdef int            n
+    cdef dict           taxid_dist_dict
 
     #print(categories)
     
     uniques = {}
-    
+        
     if mergedKeys_list is not None:
         mergedKeys=set(mergedKeys_list)
     else:
@@ -95,10 +216,10 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
     
     if taxonomy is not None:
         mergedKeys.add('taxid')
-    
+        
     if categories is None:
         categories = []
-    
+        
     # Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names
     logger("info", "First browsing through the input")
     merged_infos = {}
@@ -132,11 +253,12 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
                           comments=i_col.comments,
                           alias=merged_col_name     # TODO what if it already exists
                          )
-        
+         
     del(merged_infos)
     
     logger("info", "Second browsing through the input")
     merged_ids_dict = {}
+    taxid_dist_dict = {}
     i = 0
     o_idx = 0
     seq_col = view[NUC_SEQUENCE_COLUMN]
@@ -167,19 +289,17 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
             o_seq = o_view[u_idx]                    
             o_seq[COUNT_COLUMN] += i_count
                         
-#            if taxonomy is not None and 'taxid' in seq:
-#                s['merged_taxid'][seq['taxid']]=
             for key in mergedKeys:
-#                if key=='taxid' and mergeIds:   # TODO
-#                    if 'taxid_dist' in i_seq:
-#                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
-#                    if 'taxid' in i_seq:
-#                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
+                if key=='taxid' and mergeIds:
+                    if 'taxid_dist' in i_seq:
+                        taxid_dist_dict[o_seq.id].update(i_seq["taxid_dist"])
+                    if 'taxid' in i_seq:
+                        taxid_dist_dict[o_seq.id][i_seq.id] = i_seq['taxid']
                 
                 mkey = "merged_%s" % key
                 #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
                 if key in i_seq:
-                    to_merge = i_seq[key]
+                    to_merge = str(i_seq[key])
                     mcol = o_seq[mkey]
                     if mcol[to_merge] is None:
                         mcol[to_merge] = i_count
@@ -214,35 +334,36 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
 
             if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
                 o_seq[COUNT_COLUMN] = 1
-                        
+            
             for key in mergedKeys:
-#                 if key=='taxid' and mergeIds:
-#                     if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
-#                         u_seq["taxid_dist"] = {}
-#                     else :
-#                         u_seq["taxid_dist"] = o_seq["taxid_dist"]
-#                     if 'taxid' in o_seq:
-#                         u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
+                if key=='taxid' and mergeIds:
+                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
+                        taxid_dist_dict[i_seq.id] = {}
+                    if 'taxid' in o_seq:
+                        taxid_dist_dict[i_seq.id][o_seq.id] = o_seq['taxid']
                 mkey = "merged_%s" % key
                 if key in o_seq:
-                    to_merge = o_seq[key]
+                    to_merge = str(o_seq[key])
                     mcol = o_seq[mkey]
                     if to_merge in mcol and mcol[to_merge] is not None:
                         mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
                     else:
                         mcol[to_merge] = o_seq[COUNT_COLUMN]
-                    o_seq[key] = None   # TODO delete column eventually -> make C function?
+                    o_seq[key] = None   # TODO delete column eventually -> make C function that deletes 'empty' columns
                         
             if mergeIds:
-                merged_ids_dict[o_seq.id] = [o_seq.id]  # TODO check that this id is added too in the original obiuniq
+                merged_ids_dict[o_seq.id] = [o_seq.id]
                 #o_seq['merged']=[o_seq.id]
             
         i+=1 
     
     # Merged ids column
     if mergeIds :
+        logger("info", "Building merged ids column")
         nb_ids_max = 0
+        ids_list = set()
         for k in merged_ids_dict :
+            ids_list = set(list(ids_list) + merged_ids_dict[k])
             n = len(merged_ids_dict[k])
             if n > nb_ids_max :
                 nb_ids_max = n
@@ -259,9 +380,24 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
         for o_seq in o_view:
             o_seq['merged'] = merged_ids_dict[o_seq.id]
 
-    #TODO
-    #if taxonomy is not None:
-    #    mergeTaxonomyClassification(uniqSeq, taxonomy)
+    if mergeIds and 'taxid' in mergedKeys:
+        logger("info", "Building taxid dist column")
+        Column.new_column(o_view, 
+                          "taxid_dist", 
+                          OBI_INT, 
+                          nb_elements_per_line=len(ids_list),
+                          elements_names=list(ids_list),
+                          comments="obi uniq taxid dist",
+                          alias="taxid_dist"     # TODO what if it already exists
+                         )
+                
+        for o_seq in o_view:
+            if o_seq.id in taxid_dist_dict :
+                o_seq['taxid_dist'] = taxid_dist_dict[o_seq.id]
+
+    if taxonomy is not None:
+        logger("info", "Merging taxonomy classification")
+        merge_taxonomy_classification(o_view, taxonomy)
 
 
      
@@ -269,6 +405,8 @@ def run(config):
     
     cdef tuple         input
     cdef tuple         output 
+    cdef tuple         taxo_uri
+    cdef Taxonomy      taxo
     cdef View_NUC_SEQS entries
     cdef View_NUC_SEQS o_view
     cdef ProgressBar   pb
@@ -288,7 +426,12 @@ def run(config):
     
     entries = input[1]
     o_view = output[1]
-    
+    if 'taxoURI' in config['obi'] :   # TODO default None problem
+        taxo_uri = open_uri(config['obi']['taxoURI'])
+        taxo = taxo_uri[1]
+    else :
+        taxo = None
+
     # Initialize the progress bar
     pb = ProgressBar(len(entries), config, seconde=5)
 
@@ -296,26 +439,13 @@ def run(config):
 #    if options.prefix:
 #        usm = uniqPrefixSequence
 #    else:
-    usm = uniqSequence
-    
-#     if 'taxoURI' in config['obi'] :   # TODO default None problem
-#         taxo = open_uri(config['obi']['taxoURI'])
-#     else :
-    taxo = None
-
+    usm = uniq_sequences
+        
     usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])       
 
-#     if 'merge' in config['uniq'] :
-#         merged_keys=set(config['uniq']['merge'])
-#     else:
-#         merged_keys=set() 
-#       
-#     if 'taxo' in config['obi'] :
-#         merged_keys.add('taxid')
-             
     print("\n")
     print(repr(o_view))
-    
-    input[0].close()
-    output[0].close()     
+
+    input[0].close()
+    output[0].close()