From 5ddd1d9ae6d10da1eaf05f1546883048905c32ae Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 4 Oct 2017 16:13:07 +0200 Subject: [PATCH] obi uniq: added taxonomy handling --- python/obitools3/commands/uniq.pxd | 3 +- python/obitools3/commands/uniq.pyx | 252 ++++++++++++++++++++++------- 2 files changed, 193 insertions(+), 62 deletions(-) diff --git a/python/obitools3/commands/uniq.pxd b/python/obitools3/commands/uniq.pxd index 7402339..db21652 100644 --- a/python/obitools3/commands/uniq.pxd +++ b/python/obitools3/commands/uniq.pxd @@ -5,4 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS -cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*) +cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) +cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*) diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx index 907367e..189e34c 100644 --- a/python/obitools3/commands/uniq.pyx +++ b/python/obitools3/commands/uniq.pyx @@ -7,13 +7,11 @@ from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column, Column_line from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t -from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption +from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption, addTaxonomyInputOption from obitools3.uri.decode import open_uri from obitools3.apps.config import logger -# TODO silence non-implemented options - __title__="Groups records together" @@ -22,7 +20,7 @@ def addOptions(parser): addSequenceInputOption(parser) addMinimalOutputOption(parser) - + addTaxonomyInputOption(parser) group = parser.add_argument_group('obi uniq specific options') @@ -46,17 +44,138 @@ def addOptions(parser): "used to group sequences before dereplication " "(option can be used several times).") - group.add_argument('--prefix', '-p', - action="store_true", dest="uniq:prefix", - default=False, - help="Dereplication is done based on prefix matching: " - "(i) The shortest sequence of each group is a prefix " - "of any sequence of its group (ii) Two shortest " - "sequences of any couple of groups are not the" - "prefix of the other one.") +# group.add_argument('--prefix', '-p', +# action="store_true", dest="uniq:prefix", +# default=False, +# help="Dereplication is done based on prefix matching: " +# "(i) The shortest sequence of each group is a prefix " +# "of any sequence of its group (ii) Two shortest " +# "sequences of any couple of groups are not the" +# "prefix of the other one.") -cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) : +cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) : + + cdef int taxid + cdef Line seq + cdef list m_taxids + cdef bytes k + cdef object tsp + cdef object tgn + cdef object tfa + cdef object sp_sn + cdef object gn_sn + cdef object fa_sn + + # Create columns + if 'species' in o_view and o_view['species'].data_type_int != OBI_INT : + o_view.delete_column('species') + if 'species' not in o_view: + Column.new_column(o_view, + 'species', + OBI_INT + ) + + if 'genus' in o_view and o_view['genus'].data_type_int != OBI_INT : + o_view.delete_column('genus') + if 'genus' not in o_view: + Column.new_column(o_view, + 'genus', + OBI_INT + ) + + if 'family' in o_view and o_view['family'].data_type_int != OBI_INT : + o_view.delete_column('family') + if 'family' not in o_view: + Column.new_column(o_view, + 'family', + OBI_INT + ) + + if 'species_name' in o_view and o_view['species_name'].data_type_int != OBI_STR : + o_view.delete_column('species_name') + if 'species_name' not in o_view: + Column.new_column(o_view, + 'species_name', + OBI_STR + ) + + if 'genus_name' in o_view and o_view['genus_name'].data_type_int != OBI_STR : + o_view.delete_column('genus_name') + if 'genus_name' not in o_view: + Column.new_column(o_view, + 'genus_name', + OBI_STR + ) + + if 'family_name' in o_view and o_view['family_name'].data_type_int != OBI_STR : + o_view.delete_column('family_name') + if 'family_name' not in o_view: + Column.new_column(o_view, + 'family_name', + OBI_STR + ) + + if 'rank' in o_view and o_view['rank'].data_type_int != OBI_STR : + o_view.delete_column('rank') + if 'rank' not in o_view: + Column.new_column(o_view, + 'rank', + OBI_STR + ) + + if 'scientific_name' in o_view and o_view['scientific_name'].data_type_int != OBI_STR : + o_view.delete_column('scientific_name') + if 'scientific_name' not in o_view: + Column.new_column(o_view, + 'scientific_name', + OBI_STR + ) + + for seq in o_view: + + if 'merged_taxid' in seq : + m_taxids = [] + for k in seq['merged_taxid'].keys() : + if seq['merged_taxid'][k] is not None: + m_taxids.append(int(k)) + taxid = taxonomy.last_common_taxon(*m_taxids) + seq['taxid'] = taxid + tsp = taxonomy.get_species(taxid) + tgn = taxonomy.get_genus(taxid) + tfa = taxonomy.get_family(taxid) + + if tsp is not None: + sp_sn = taxonomy.get_scientific_name(tsp) + else: + sp_sn = None # TODO was '###', discuss + tsp = None # TODO was '-1', discuss + + if tgn is not None: + gn_sn = taxonomy.get_scientific_name(tgn) + else: + gn_sn = None + tgn = None + + if tfa is not None: + fa_sn = taxonomy.get_scientific_name(tfa) + else: + fa_sn = None + tfa = None + + seq['species'] = tsp + seq['genus'] = tgn + seq['family'] = tfa + + seq['species_name'] = sp_sn + seq['genus_name'] = gn_sn + seq['family_name'] = fa_sn + + seq['rank'] = taxonomy.get_rank(taxid) + seq['scientific_name'] = taxonomy.get_scientific_name(taxid) + + +cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) : cdef int i cdef int o_idx @@ -80,14 +199,16 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list cdef Column_line i_mcol cdef list catl cdef dict merged_ids_dict - - cdef bytes k - cdef int n + cdef int nb_ids_max + cdef set ids_list + cdef bytes k + cdef int n + cdef dict taxid_dist_dict #print(categories) uniques = {} - + if mergedKeys_list is not None: mergedKeys=set(mergedKeys_list) else: @@ -95,10 +216,10 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list if taxonomy is not None: mergedKeys.add('taxid') - + if categories is None: categories = [] - + # Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names logger("info", "First browsing through the input") merged_infos = {} @@ -132,11 +253,12 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list comments=i_col.comments, alias=merged_col_name # TODO what if it already exists ) - + del(merged_infos) logger("info", "Second browsing through the input") merged_ids_dict = {} + taxid_dist_dict = {} i = 0 o_idx = 0 seq_col = view[NUC_SEQUENCE_COLUMN] @@ -167,19 +289,17 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list o_seq = o_view[u_idx] o_seq[COUNT_COLUMN] += i_count -# if taxonomy is not None and 'taxid' in seq: -# s['merged_taxid'][seq['taxid']]= for key in mergedKeys: -# if key=='taxid' and mergeIds: # TODO -# if 'taxid_dist' in i_seq: -# u_seq["taxid_dist"].update(i_seq["taxid_dist"]) -# if 'taxid' in i_seq: -# u_seq["taxid_dist"][i_seq.id] = i_seq['taxid'] + if key=='taxid' and mergeIds: + if 'taxid_dist' in i_seq: + taxid_dist_dict[o_seq.id].update(i_seq["taxid_dist"]) + if 'taxid' in i_seq: + taxid_dist_dict[o_seq.id][i_seq.id] = i_seq['taxid'] mkey = "merged_%s" % key #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive if key in i_seq: - to_merge = i_seq[key] + to_merge = str(i_seq[key]) mcol = o_seq[mkey] if mcol[to_merge] is None: mcol[to_merge] = i_count @@ -214,35 +334,36 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None: o_seq[COUNT_COLUMN] = 1 - + for key in mergedKeys: -# if key=='taxid' and mergeIds: -# if 'taxid' in o_seq and 'taxid_dist' not in o_seq: -# u_seq["taxid_dist"] = {} -# else : -# u_seq["taxid_dist"] = o_seq["taxid_dist"] -# if 'taxid' in o_seq: -# u_seq["taxid_dist"][o_seq.id] = o_seq['taxid'] + if key=='taxid' and mergeIds: + if 'taxid' in o_seq and 'taxid_dist' not in o_seq: + taxid_dist_dict[i_seq.id] = {} + if 'taxid' in o_seq: + taxid_dist_dict[i_seq.id][o_seq.id] = o_seq['taxid'] mkey = "merged_%s" % key if key in o_seq: - to_merge = o_seq[key] + to_merge = str(o_seq[key]) mcol = o_seq[mkey] if to_merge in mcol and mcol[to_merge] is not None: mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN] else: mcol[to_merge] = o_seq[COUNT_COLUMN] - o_seq[key] = None # TODO delete column eventually -> make C function? + o_seq[key] = None # TODO delete column eventually -> make C function that deletes 'empty' columns if mergeIds: - merged_ids_dict[o_seq.id] = [o_seq.id] # TODO check that this id is added too in the original obiuniq + merged_ids_dict[o_seq.id] = [o_seq.id] #o_seq['merged']=[o_seq.id] i+=1 # Merged ids column if mergeIds : + logger("info", "Building merged ids column") nb_ids_max = 0 + ids_list = set() for k in merged_ids_dict : + ids_list = set(list(ids_list) + merged_ids_dict[k]) n = len(merged_ids_dict[k]) if n > nb_ids_max : nb_ids_max = n @@ -259,9 +380,24 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list for o_seq in o_view: o_seq['merged'] = merged_ids_dict[o_seq.id] - #TODO - #if taxonomy is not None: - # mergeTaxonomyClassification(uniqSeq, taxonomy) + if mergeIds and 'taxid' in mergedKeys: + logger("info", "Building taxid dist column") + Column.new_column(o_view, + "taxid_dist", + OBI_INT, + nb_elements_per_line=len(ids_list), + elements_names=list(ids_list), + comments="obi uniq taxid dist", + alias="taxid_dist" # TODO what if it already exists + ) + + for o_seq in o_view: + if o_seq.id in taxid_dist_dict : + o_seq['taxid_dist'] = taxid_dist_dict[o_seq.id] + + if taxonomy is not None: + logger("info", "Merging taxonomy classification") + merge_taxonomy_classification(o_view, taxonomy) @@ -269,6 +405,8 @@ def run(config): cdef tuple input cdef tuple output + cdef tuple taxo_uri + cdef Taxonomy taxo cdef View_NUC_SEQS entries cdef View_NUC_SEQS o_view cdef ProgressBar pb @@ -288,7 +426,12 @@ def run(config): entries = input[1] o_view = output[1] - + if 'taxoURI' in config['obi'] : # TODO default None problem + taxo_uri = open_uri(config['obi']['taxoURI']) + taxo = taxo_uri[1] + else : + taxo = None + # Initialize the progress bar pb = ProgressBar(len(entries), config, seconde=5) @@ -296,26 +439,13 @@ def run(config): # if options.prefix: # usm = uniqPrefixSequence # else: - usm = uniqSequence - -# if 'taxoURI' in config['obi'] : # TODO default None problem -# taxo = open_uri(config['obi']['taxoURI']) -# else : - taxo = None - + usm = uniq_sequences + usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories']) -# if 'merge' in config['uniq'] : -# merged_keys=set(config['uniq']['merge']) -# else: -# merged_keys=set() -# -# if 'taxo' in config['obi'] : -# merged_keys.add('taxid') - print("\n") print(repr(o_view)) - - input[0].close() - output[0].close() + + input[0].close() + output[0].close()