From 73bca6288f0788f92daf67d58a2a0417ff2726e1 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sun, 20 Aug 2017 18:04:21 +0200 Subject: [PATCH] New obi uniq --- python/obitools3/commands/uniq.pxd | 8 + python/obitools3/commands/uniq.pyx | 371 ++++++++++++++--------------- 2 files changed, 184 insertions(+), 195 deletions(-) create mode 100644 python/obitools3/commands/uniq.pxd diff --git a/python/obitools3/commands/uniq.pxd b/python/obitools3/commands/uniq.pxd new file mode 100644 index 0000000..3452e17 --- /dev/null +++ b/python/obitools3/commands/uniq.pxd @@ -0,0 +1,8 @@ +#cython: language_level=3 + +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.dms.taxo.taxo cimport Taxonomy +from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS + + +cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*) diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx index 82d1884..37ff7a5 100644 --- a/python/obitools3/commands/uniq.pyx +++ b/python/obitools3/commands/uniq.pyx @@ -1,59 +1,29 @@ #cython: language_level=3 from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport -from obitools3.dms.dms import DMS # TODO cimport doesn't work -from obitools3.dms.view.view import View # TODO cimport doesn't work -from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS -from obitools3.dms.obiseq cimport Nuc_Seq -from obitools3.dms.column.column cimport Column -from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN -from obitools3.dms.capi.obitypes cimport OBI_INT -from obitools3.utils cimport tostr +from obitools3.dms.dms cimport DMS +from obitools3.dms.taxo.taxo cimport Taxonomy +from obitools3.dms.view.view cimport View, Line +from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS +from obitools3.dms.column.column cimport Column, Column_line +from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN +from obitools3.dms.capi.obitypes cimport OBI_INT, index_t +from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption +from obitools3.uri.decode import open_uri +from obitools3.apps.config import logger # TODO silence non-implemented options __title__="Groups records together" -default_config = { 'inputview' : None, - 'outputview' : None - } + def addOptions(parser): - # TODO put this common group somewhere else but I don't know where - group=parser.add_argument_group('DMS and view options') + addSequenceInputOption(parser) + addMinimalOutputOption(parser) - group.add_argument('--default-dms','-d', - action="store", dest="obi:defaultdms", - metavar='', - default=None, - type=str, - help="Name of the default DMS for reading and writing data.") - - group.add_argument('--input-view','-i', - action="store", dest="obi:inputview", - metavar='', - default=None, - type=str, - help="Name of the input view, either raw if the view is in the default DMS," - " or in the form 'dms:view' if it is in another DMS.") - - group.add_argument('--output-view','-o', - action="store", dest="obi:outputview", - metavar='', - default=None, - type=str, - help="Name of the output view, either raw if the view is in the default DMS," - " or in the form 'dms:view' if it is in another DMS.") - - group.add_argument('--taxo','-t', - action="store", dest="obi:taxo", - metavar='', - default='', # TODO not None because if it's None, the option is not entered in the option dictionary. - type=str, - help="Name of the taxonomy to use.") - group = parser.add_argument_group('obi uniq specific options') @@ -89,152 +59,198 @@ def addOptions(parser): # TODO taxonomy -# TODO -COUNT_COLUMN_str = tostr(COUNT_COLUMN) - - -def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) : +cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) : + cdef int i + cdef int o_idx + cdef int u_idx + cdef int u_id + cdef int i_count + cdef set mergedKeys + cdef dict uniques + cdef dict merged_infos + cdef object iter_view + cdef Line i_seq + cdef Line o_seq + cdef str key + cdef bytes key_b + cdef str mkey + cdef str merged_col_name + cdef Column i_col + cdef Column seq_col + cdef object to_merge + cdef Column_line mcol + cdef Column_line i_mcol + uniques = {} if categories is None: categories=[] - if mergedKey is not None: - mergedKey=set(mergedKey) + if mergedKeys_list is not None: + mergedKeys=set(mergedKeys_list) else: - mergedKey=set() + mergedKeys=set() - if taxonomy is not None: - mergedKey.add('taxid') +# if taxonomy is not None: +# mergedKeys.add('taxid') - # Faire parcours de la view des colonnes à merged pour créer les merged_col avant et les remplir au fur et à mesure - - o_idx = 0 + # Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names + #logger("info", "obi uniq", "First browsing through the input") + merged_infos = {} i = 0 - seq_col = view[NUC_SEQUENCE_COLUMN] - iter_view = iter(view) - for i_seq in iter_view : - pass + for i_seq in iter_view: + pb(i) + for key in mergedKeys: + mkey = "merged_%s" % key + if key in i_seq: # TODO what if mkey already in i_seq? + if mkey not in merged_infos: + merged_infos[mkey] = {} + mkey_infos = merged_infos[mkey] + mkey_infos['nb_elts'] = 1 + mkey_infos['elt_names'] = [i_seq[key]] + else: + mkey_infos = merged_infos[mkey] + if i_seq[key] not in mkey_infos['elt_names']: # TODO make faster? but how? + mkey_infos['elt_names'].append(i_seq[key]) + mkey_infos['nb_elts'] += 1 + i+=1 + + for key in mergedKeys: + merged_col_name = "merged_%s" % key + i_col = view[key] + Column.new_column(o_view, + merged_col_name, + OBI_INT, + nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'], + elements_names=merged_infos[merged_col_name]['elt_names'], + comments=i_col.comments, + alias=merged_col_name # TODO what if it already exists + ) + del(merged_infos) + + #logger("info", "obi uniq", "Second browsing through the input") + i = 0 + o_idx = 0 + seq_col = view[NUC_SEQUENCE_COLUMN] + iter_view = iter(view) for i_seq in iter_view : pb(i) - # utiliser l'index des AVLs, faire l'API - #u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),) + + #u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),) u_id = seq_col.get_line_idx(i) + if u_id in uniques: - u_seq = uniques[u_id] - o_seq = o_view[u_seq['idx']] - if COUNT_COLUMN_str in i_seq: - o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str] + + if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None: + i_count = 1 else: - o_seq[COUNT_COLUMN_str] += 1 -# seq['COUNT']=1 + i_count = i_seq[COUNT_COLUMN] + + u_idx = uniques[u_id] + o_seq = o_view[u_idx] + o_seq[COUNT_COLUMN] += i_count + # if taxonomy is not None and 'taxid' in seq: # s['merged_taxid'][seq['taxid']]= - for key in mergedKey: - if key=='taxid' and mergeIds: # TODO - if 'taxid_dist' in i_seq: - u_seq["taxid_dist"].update(i_seq["taxid_dist"]) - if 'taxid' in i_seq: - u_seq["taxid_dist"][i_seq.id] = i_seq['taxid'] - - mkey = "merged_%s" % key + for key in mergedKeys: +# if key=='taxid' and mergeIds: # TODO +# if 'taxid_dist' in i_seq: +# u_seq["taxid_dist"].update(i_seq["taxid_dist"]) +# if 'taxid' in i_seq: +# u_seq["taxid_dist"][i_seq.id] = i_seq['taxid'] + + mkey = "merged_%s" % key #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive if key in i_seq: - u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str] + to_merge = i_seq[key] + mcol = o_seq[mkey] + if mcol[to_merge] is None: + mcol[to_merge] = i_count + else: + mcol[to_merge] = mcol[to_merge] + i_count #cas ou merged_keys existe deja - else: + else: # TODO is this a good else if mkey in i_seq: - for skey in i_seq[mkey]: - u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey] - - for key in i_seq.keys(): + mcol = o_seq[mkey] + i_mcol = i_seq[mkey] + for key_b in i_mcol: + if mcol[key_b] is None: + mcol[key_b] = i_mcol[key_b] + else: + mcol[key_b] = mcol[key_b] + i_mcol[key_b] + + for key_b in i_seq.keys(): # Merger proprement l'attribut merged s'il existe - if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str : #and key[0:7]!='merged_' and key!='merged': TODO check this - o_seq[key] = None - - if mergeIds: - u_seq['merged'].append(i_seq.id) - - else: + # TODO make special columns list? + if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] : + o_seq[key_b] = None + + # if mergeIds: # TODO + # u_seq['merged'].append(i_seq.id) + else: o_view[o_idx] = i_seq o_seq = o_view[o_idx] - uniques[u_id] = {'idx':o_idx} - u_seq = uniques[u_id] + uniques[u_id] = o_idx o_idx += 1 - if COUNT_COLUMN_str not in o_seq: - o_seq[COUNT_COLUMN_str] = 1 - - for key in mergedKey: - if key=='taxid' and mergeIds: - if 'taxid' in o_seq and 'taxid_dist' not in o_seq: - u_seq["taxid_dist"] = {} - else : - u_seq["taxid_dist"] = o_seq["taxid_dist"] - if 'taxid' in o_seq: - u_seq["taxid_dist"][o_seq.id] = o_seq['taxid'] - mkey = "merged_%s" % key - if mkey not in o_seq: - u_seq[mkey]={} - else : - u_seq[mkey] = o_seq[mkey] + if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None: + o_seq[COUNT_COLUMN] = 1 + + for key in mergedKeys: +# if key=='taxid' and mergeIds: +# if 'taxid' in o_seq and 'taxid_dist' not in o_seq: +# u_seq["taxid_dist"] = {} +# else : +# u_seq["taxid_dist"] = o_seq["taxid_dist"] +# if 'taxid' in o_seq: +# u_seq["taxid_dist"][o_seq.id] = o_seq['taxid'] + mkey = "merged_%s" % key if key in o_seq: - u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str] - o_seq[key] = None - - if mergeIds: - u_seq['merged']=[o_seq.id] - + to_merge = o_seq[key] + mcol = o_seq[mkey] + if to_merge in mcol and mcol[to_merge] is not None: + mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN] + else: + mcol[to_merge] = o_seq[COUNT_COLUMN] + o_seq[key] = None # TODO delete column eventually -> make C function? + +# if mergeIds: +# u_seq['merged']=[o_seq.id] i+=1 - + #TODO #if taxonomy is not None: # mergeTaxonomyClassification(uniqSeq, taxonomy) - # Get informations to build the columns with merged attributes - merged_infos = {} - for u_id in uniques : - u_seq = uniques[u_id] - for mkey in u_seq : - if mkey != 'idx' : - mkey_dict = u_seq[mkey] - if mkey not in merged_infos : - merged_infos[mkey] = {} - mkey_infos = merged_infos[mkey] - mkey_infos['nb_elts'] = len(mkey_dict.keys()) - mkey_infos['elt_names'] = [k for k in mkey_dict] - else : - mkey_infos = merged_infos[mkey] - for k in mkey_dict : - if k not in mkey_infos['elt_names'] : - mkey_infos['elt_names'].append(k) - mkey_infos['nb_elts'] += 1 - - keys_to_del = [] - for k in merged_infos : - if merged_infos[k]['nb_elts'] == 0: - keys_to_del.append(k) - for k in keys_to_del : - del merged_infos[k] - - return (uniques, merged_infos) def run(config): - # TODO declare variables + cdef tuple input + cdef tuple output + cdef View_NUC_SEQS entries + cdef View_NUC_SEQS o_view + cdef ProgressBar pb + + logger("info","obi uniq") - # Open DMS - d = DMS.open(config['obi']['defaultdms']) - - # Open input view - entries = View.open(d, config['obi']['inputview']) - + input = open_uri(config['obi']['inputURI']) + + if input[2] != View_NUC_SEQS: + raise NotImplementedError('obi uniq only works on NUC_SEQS views') + + output = open_uri(config['obi']['outputURI'], + input=False, + newviewtype=View_NUC_SEQS) + + entries = input[1] + o_view = output[1] + # Initialize the progress bar pb = ProgressBar(len(entries), config, seconde=5) @@ -243,55 +259,20 @@ def run(config): # usm = uniqPrefixSequence # else: usm = uniqSequence - - # Create output view - view_class = View.get_view_class(entries.type) - if view_class == View_NUC_SEQS : - get_quality = tostr(QUALITY_COLUMN) in entries # TODO - o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality) - else : - o_view = view_class.new(d, config['obi']['outputview']) - (uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories']) + usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories']) - if 'merge' in config['uniq'] : - merged_keys=set(config['uniq']['merge']) - else: - merged_keys=set() - - if 'taxo' in config['obi'] : - merged_keys.add('taxid') - - # TODO gotta handle special merged columns - for k in merged_keys: - merged_col_name = "merged_%s" % k - if merged_col_name in merged_infos : - i_col = entries[k] - Column.new_column(o_view, - merged_col_name, - OBI_INT, - nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'], - elements_names=merged_infos[merged_col_name]['elt_names'], - comments=i_col.comments, - alias=merged_col_name - ) - - for u_id in uniques: - u_dict = uniques[u_id] - for merged_k in u_dict : - if merged_k in merged_infos : # TODO don't enter irrelevant keys to begin with, instead - o_view[u_dict['idx']][merged_k] = u_dict[merged_k] +# if 'merge' in config['uniq'] : +# merged_keys=set(config['uniq']['merge']) +# else: +# merged_keys=set() +# +# if 'taxo' in config['obi'] : +# merged_keys.add('taxid') print("\n") print(repr(o_view)) - - d.close() - - - - - - - - - \ No newline at end of file + + input[0].close() + output[0].close() +