From b2fc1f461196800a210357cf7774ecb434a6ff7d Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 27 Jul 2017 19:40:19 +0200 Subject: [PATCH] obi uniq: first version --- python/obitools3/commands/uniq.pyx | 297 +++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 python/obitools3/commands/uniq.pyx diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx new file mode 100644 index 0000000..82d1884 --- /dev/null +++ b/python/obitools3/commands/uniq.pyx @@ -0,0 +1,297 @@ +#cython: language_level=3 + +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.dms.dms import DMS # TODO cimport doesn't work +from obitools3.dms.view.view import View # TODO cimport doesn't work +from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS +from obitools3.dms.obiseq cimport Nuc_Seq +from obitools3.dms.column.column cimport Column +from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN +from obitools3.dms.capi.obitypes cimport OBI_INT +from obitools3.utils cimport tostr + + +# TODO silence non-implemented options + +__title__="Groups records together" + +default_config = { 'inputview' : None, + 'outputview' : None + } + +def addOptions(parser): + + # TODO put this common group somewhere else but I don't know where + group=parser.add_argument_group('DMS and view options') + + group.add_argument('--default-dms','-d', + action="store", dest="obi:defaultdms", + metavar='', + default=None, + type=str, + help="Name of the default DMS for reading and writing data.") + + group.add_argument('--input-view','-i', + action="store", dest="obi:inputview", + metavar='', + default=None, + type=str, + help="Name of the input view, either raw if the view is in the default DMS," + " or in the form 'dms:view' if it is in another DMS.") + + group.add_argument('--output-view','-o', + action="store", dest="obi:outputview", + metavar='', + default=None, + type=str, + help="Name of the output view, either raw if the view is in the default DMS," + " or in the form 'dms:view' if it is in another DMS.") + + group.add_argument('--taxo','-t', + action="store", dest="obi:taxo", + metavar='', + default='', # TODO not None because if it's None, the option is not entered in the option dictionary. + type=str, + help="Name of the taxonomy to use.") + + + group = parser.add_argument_group('obi uniq specific options') + + group.add_argument('--merge', '-m', + action="append", dest="uniq:merge", + metavar="", + default=[], + type=str, + help="Attributes to merge.") # TODO must be a 1 elt/line column + + group.add_argument('--merge-ids', '-e', + action="store_true", dest="uniq:mergeids", + default=False, + help="Add the merged key with all ids of merged sequences.") + + group.add_argument('--category-attribute', '-c', + action="append", dest="uniq:categories", + metavar="", + default=[], + help="Add one attribute to the list of attributes " + "used to group sequences before dereplication " + "(option can be used several times).") + + group.add_argument('--prefix', '-p', + action="store_true", dest="uniq:prefix", + default=False, + help="Dereplication is done based on prefix matching: " + "(i) The shortest sequence of each group is a prefix " + "of any sequence of its group (ii) Two shortest " + "sequences of any couple of groups are not the" + "prefix of the other one.") + +# TODO taxonomy + + +# TODO +COUNT_COLUMN_str = tostr(COUNT_COLUMN) + + +def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) : + + uniques = {} + + if categories is None: + categories=[] + + if mergedKey is not None: + mergedKey=set(mergedKey) + else: + mergedKey=set() + + if taxonomy is not None: + mergedKey.add('taxid') + + # Faire parcours de la view des colonnes à merged pour créer les merged_col avant et les remplir au fur et à mesure + + o_idx = 0 + i = 0 + seq_col = view[NUC_SEQUENCE_COLUMN] + + iter_view = iter(view) + for i_seq in iter_view : + pass + + iter_view = iter(view) + for i_seq in iter_view : + pb(i) + # utiliser l'index des AVLs, faire l'API + #u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),) + u_id = seq_col.get_line_idx(i) + if u_id in uniques: + u_seq = uniques[u_id] + o_seq = o_view[u_seq['idx']] + if COUNT_COLUMN_str in i_seq: + o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str] + else: + o_seq[COUNT_COLUMN_str] += 1 +# seq['COUNT']=1 +# if taxonomy is not None and 'taxid' in seq: +# s['merged_taxid'][seq['taxid']]= + for key in mergedKey: + if key=='taxid' and mergeIds: # TODO + if 'taxid_dist' in i_seq: + u_seq["taxid_dist"].update(i_seq["taxid_dist"]) + if 'taxid' in i_seq: + u_seq["taxid_dist"][i_seq.id] = i_seq['taxid'] + + mkey = "merged_%s" % key + #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive + if key in i_seq: + u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str] + #cas ou merged_keys existe deja + else: + if mkey in i_seq: + for skey in i_seq[mkey]: + u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey] + + for key in i_seq.keys(): + # Merger proprement l'attribut merged s'il existe + if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str : #and key[0:7]!='merged_' and key!='merged': TODO check this + o_seq[key] = None + + if mergeIds: + u_seq['merged'].append(i_seq.id) + + else: + + o_view[o_idx] = i_seq + o_seq = o_view[o_idx] + uniques[u_id] = {'idx':o_idx} + u_seq = uniques[u_id] + o_idx += 1 + + if COUNT_COLUMN_str not in o_seq: + o_seq[COUNT_COLUMN_str] = 1 + + for key in mergedKey: + if key=='taxid' and mergeIds: + if 'taxid' in o_seq and 'taxid_dist' not in o_seq: + u_seq["taxid_dist"] = {} + else : + u_seq["taxid_dist"] = o_seq["taxid_dist"] + if 'taxid' in o_seq: + u_seq["taxid_dist"][o_seq.id] = o_seq['taxid'] + mkey = "merged_%s" % key + if mkey not in o_seq: + u_seq[mkey]={} + else : + u_seq[mkey] = o_seq[mkey] + if key in o_seq: + u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str] + o_seq[key] = None + + if mergeIds: + u_seq['merged']=[o_seq.id] + + i+=1 + + #TODO + #if taxonomy is not None: + # mergeTaxonomyClassification(uniqSeq, taxonomy) + + # Get informations to build the columns with merged attributes + merged_infos = {} + for u_id in uniques : + u_seq = uniques[u_id] + for mkey in u_seq : + if mkey != 'idx' : + mkey_dict = u_seq[mkey] + if mkey not in merged_infos : + merged_infos[mkey] = {} + mkey_infos = merged_infos[mkey] + mkey_infos['nb_elts'] = len(mkey_dict.keys()) + mkey_infos['elt_names'] = [k for k in mkey_dict] + else : + mkey_infos = merged_infos[mkey] + for k in mkey_dict : + if k not in mkey_infos['elt_names'] : + mkey_infos['elt_names'].append(k) + mkey_infos['nb_elts'] += 1 + + keys_to_del = [] + for k in merged_infos : + if merged_infos[k]['nb_elts'] == 0: + keys_to_del.append(k) + for k in keys_to_del : + del merged_infos[k] + + return (uniques, merged_infos) + + +def run(config): + + # TODO declare variables + + # Open DMS + d = DMS.open(config['obi']['defaultdms']) + + # Open input view + entries = View.open(d, config['obi']['inputview']) + + # Initialize the progress bar + pb = ProgressBar(len(entries), config, seconde=5) + +# TODO +# if options.prefix: +# usm = uniqPrefixSequence +# else: + usm = uniqSequence + + # Create output view + view_class = View.get_view_class(entries.type) + if view_class == View_NUC_SEQS : + get_quality = tostr(QUALITY_COLUMN) in entries # TODO + o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality) + else : + o_view = view_class.new(d, config['obi']['outputview']) + + (uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories']) + + if 'merge' in config['uniq'] : + merged_keys=set(config['uniq']['merge']) + else: + merged_keys=set() + + if 'taxo' in config['obi'] : + merged_keys.add('taxid') + + # TODO gotta handle special merged columns + for k in merged_keys: + merged_col_name = "merged_%s" % k + if merged_col_name in merged_infos : + i_col = entries[k] + Column.new_column(o_view, + merged_col_name, + OBI_INT, + nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'], + elements_names=merged_infos[merged_col_name]['elt_names'], + comments=i_col.comments, + alias=merged_col_name + ) + + for u_id in uniques: + u_dict = uniques[u_id] + for merged_k in u_dict : + if merged_k in merged_infos : # TODO don't enter irrelevant keys to begin with, instead + o_view[u_dict['idx']][merged_k] = u_dict[merged_k] + + print("\n") + print(repr(o_view)) + + d.close() + + + + + + + + + \ No newline at end of file