From 43f65e7fd059b56a723098572335e95cc380ffc3 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 26 Oct 2017 19:00:05 +0200 Subject: [PATCH] obi uniq: fixed bug where dictionary indexes were not read properly, and added view rollback in case of an exception. --- python/obitools3/commands/uniq.pyx | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx index 9cf65fd..db76ac7 100644 --- a/python/obitools3/commands/uniq.pyx +++ b/python/obitools3/commands/uniq.pyx @@ -3,6 +3,7 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.dms import DMS from obitools3.dms.view.view cimport View, Line +from obitools3.dms.view import RollbackException from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column, Column_line from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN @@ -44,16 +45,6 @@ def addOptions(parser): "used to group sequences before dereplication " "(option can be used several times).") -# TODO discuss -# group.add_argument('--prefix', '-p', -# action="store_true", dest="uniq:prefix", -# default=False, -# help="Dereplication is done based on prefix matching: " -# "(i) The shortest sequence of each group is a prefix " -# "of any sequence of its group (ii) Two shortest " -# "sequences of any couple of groups are not the" -# "prefix of the other one.") - cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) : @@ -300,7 +291,9 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li mkey = "merged_%s" % key #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive if key in i_seq: - to_merge = str(i_seq[key]) + to_merge = i_seq[key] + if type(to_merge) != bytes : + to_merge = str(to_merge) mcol = o_seq[mkey] if mcol[to_merge] is None: mcol[to_merge] = i_count @@ -325,7 +318,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li if mergeIds : merged_ids_dict[o_seq.id].append(i_seq.id) - #o_seq['merged'].append(i_seq.id) else: o_view[o_idx] = i_seq @@ -344,7 +336,9 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li taxid_dist_dict[i_seq.id][o_seq.id] = o_seq['taxid'] mkey = "merged_%s" % key if key in o_seq: - to_merge = str(o_seq[key]) + to_merge = o_seq[key] + if type(to_merge) != bytes : + to_merge = str(to_merge) mcol = o_seq[mkey] if to_merge in mcol and mcol[to_merge] is not None: mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN] @@ -354,7 +348,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li if mergeIds: merged_ids_dict[o_seq.id] = [o_seq.id] - #o_seq['merged']=[o_seq.id] i+=1 @@ -436,9 +429,12 @@ def run(config): # Initialize the progress bar pb = ProgressBar(len(entries), config, seconde=5) + + try: + uniq_sequences(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories']) + except: + raise RollbackException("obi uniq error, rollbacking view", o_view) - uniq_sequences(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories']) - print("\n") print(repr(o_view))