diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx index b7db87a..907367e 100644 --- a/python/obitools3/commands/uniq.pyx +++ b/python/obitools3/commands/uniq.pyx @@ -6,7 +6,7 @@ from obitools3.dms.view.view cimport View, Line from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column, Column_line from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN -from obitools3.dms.capi.obitypes cimport OBI_INT, index_t +from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption from obitools3.uri.decode import open_uri from obitools3.apps.config import logger @@ -36,7 +36,7 @@ def addOptions(parser): group.add_argument('--merge-ids', '-e', action="store_true", dest="uniq:mergeids", default=False, - help="Add the merged key with all ids of merged sequences.") + help="ONLY WORKING ON SMALL SETS FOR NOW Add the merged key with all ids of merged sequences.") group.add_argument('--category-attribute', '-c', action="append", dest="uniq:categories", @@ -55,8 +55,6 @@ def addOptions(parser): "sequences of any couple of groups are not the" "prefix of the other one.") -# TODO taxonomy - cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) : @@ -81,7 +79,11 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list cdef Column_line mcol cdef Column_line i_mcol cdef list catl + cdef dict merged_ids_dict + cdef bytes k + cdef int n + #print(categories) uniques = {} @@ -130,10 +132,11 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list comments=i_col.comments, alias=merged_col_name # TODO what if it already exists ) - + del(merged_infos) logger("info", "Second browsing through the input") + merged_ids_dict = {} i = 0 o_idx = 0 seq_col = view[NUC_SEQUENCE_COLUMN] @@ -199,8 +202,9 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] : o_seq[key_b] = None - # if mergeIds: # TODO - # u_seq['merged'].append(i_seq.id) + if mergeIds : + merged_ids_dict[o_seq.id].append(i_seq.id) + #o_seq['merged'].append(i_seq.id) else: o_view[o_idx] = i_seq @@ -229,10 +233,32 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mcol[to_merge] = o_seq[COUNT_COLUMN] o_seq[key] = None # TODO delete column eventually -> make C function? -# if mergeIds: -# u_seq['merged']=[o_seq.id] + if mergeIds: + merged_ids_dict[o_seq.id] = [o_seq.id] # TODO check that this id is added too in the original obiuniq + #o_seq['merged']=[o_seq.id] + i+=1 - + + # Merged ids column + if mergeIds : + nb_ids_max = 0 + for k in merged_ids_dict : + n = len(merged_ids_dict[k]) + if n > nb_ids_max : + nb_ids_max = n + + Column.new_column(o_view, + "merged", + OBI_STR, + nb_elements_per_line=nb_ids_max, + elements_names=None, + comments="obi uniq merged ids", + alias="merged" # TODO what if it already exists + ) + + for o_seq in o_view: + o_seq['merged'] = merged_ids_dict[o_seq.id] + #TODO #if taxonomy is not None: # mergeTaxonomyClassification(uniqSeq, taxonomy)