diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx index 39abec9..d733492 100644 --- a/python/obitools3/commands/uniq.pyx +++ b/python/obitools3/commands/uniq.pyx @@ -7,12 +7,13 @@ from obitools3.dms.obiseq cimport Nuc_Seq_Stored from obitools3.dms.view import RollbackException from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column, Column_line -from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN +from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN, \ + TAXID_DIST_COLUMN, MERGED_TAXID_COLUMN, MERGED_COLUMN, MERGED_PREFIX from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption from obitools3.uri.decode import open_uri from obitools3.apps.config import logger -from obitools3.utils cimport tobytes +from obitools3.utils cimport tobytes, tostr import sys @@ -34,12 +35,12 @@ def addOptions(parser): metavar="", default=[], type=str, - help="Attributes to merge.") # TODO must be a 1 elt/line column + help="Attributes to merge.") # note: must be a 1 elt/line column, but columns containing already merged information (name MERGED_*) are automatically remerged group.add_argument('--merge-ids', '-e', action="store_true", dest="uniq:mergeids", default=False, - help="ONLY WORKING ON SMALL SETS FOR NOW Add the merged key with all ids of merged sequences.") # TODO ? + help="Add the merged key with all ids of merged sequences.") group.add_argument('--category-attribute', '-c', action="append", dest="uniq:categories", @@ -129,9 +130,9 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) : ) for seq in o_view: - if b"merged_taxid" in seq : + if MERGED_TAXID_COLUMN in seq : m_taxids = [] - m_taxids_dict = seq[b"merged_taxid"] + m_taxids_dict = seq[MERGED_TAXID_COLUMN] for k in m_taxids_dict.keys() : if m_taxids_dict[k] is not None: m_taxids.append(int(k)) @@ -171,7 +172,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) : seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid) -cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=10000) : +cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) : cdef int i cdef int k @@ -215,9 +216,18 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li cdef object iter_view cdef object mcol cdef object to_merge + cdef list merged_list uniques = {} + for column_name in view.keys(): + if column_name[:7] == b"MERGED_": + info_to_merge = column_name[7:] + if mergedKeys_list is not None: + mergedKeys_list.append(tostr(info_to_merge)) + else: + mergedKeys_list = [tostr(info_to_merge)] + mergedKeys_list_b = [] if mergedKeys_list is not None: for key_str in mergedKeys_list: @@ -230,12 +240,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li mergedKeys_set.add(TAXID_COLUMN) mergedKeys = list(mergedKeys_set) - + k_count = len(mergedKeys) mergedKeys_m = [] for k in range(k_count): - mergedKeys_m.append(b"merged_" + mergedKeys[k]) + mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k]) if categories is None: categories = [] @@ -245,8 +255,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li i_id_col = view[ID_COLUMN] if TAXID_COLUMN in view: i_taxid_col = view[TAXID_COLUMN] - if b"taxid_dist" in view: - i_taxid_dist_col = view[b"taxid_dist"] + if TAXID_DIST_COLUMN in view: + i_taxid_dist_col = view[TAXID_DIST_COLUMN] # First browsing @@ -278,13 +288,13 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li for k in range(k_count): key = mergedKeys[k] mkey = mergedKeys_m[k] -# if mkey in i_seq: # TODO -# if mkey not in merged_infos: -# merged_infos[mkey] = {} -# mkey_infos = merged_infos[mkey] -# mkey_infos['nb_elts'] = 1 -# mkey_infos['elt_names'] = [i_seq[key]] - if key in i_seq: # TODO what if mkey already in i_seq? --> should update + if mkey in i_seq: + if mkey not in merged_infos: + merged_infos[mkey] = {} + mkey_infos = merged_infos[mkey] + mkey_infos['nb_elts'] = view[mkey].nb_elements_per_line + mkey_infos['elt_names'] = view[mkey].elements_names + if key in i_seq: if mkey not in merged_infos: merged_infos[mkey] = {} mkey_infos = merged_infos[mkey] @@ -304,6 +314,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li key = mergedKeys[k] merged_col_name = mergedKeys_m[k] i_col = view[key] + if merged_infos[merged_col_name]['nb_elts'] > max_elts: str_merged_cols.append(merged_col_name) Column.new_column(o_view, @@ -311,7 +322,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li OBI_STR, to_eval=True, comments=i_col.comments, - alias=merged_col_name # TODO what if it already exists + alias=merged_col_name ) else: @@ -321,7 +332,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'], elements_names=list(merged_infos[merged_col_name]['elt_names']), comments=i_col.comments, - alias=merged_col_name # TODO what if it already exists + alias=merged_col_name ) mkey_cols[merged_col_name] = o_view[merged_col_name] @@ -329,41 +340,39 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li # taxid_dist column if mergeIds and TAXID_COLUMN in mergedKeys: if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view - str_merged_cols.append(b"taxid_dist") + str_merged_cols.append(TAXID_DIST_COLUMN) Column.new_column(o_view, - b"taxid_dist", + TAXID_DIST_COLUMN, OBI_STR, to_eval=True, - alias=b"taxid_dist" # TODO what if it already exists + alias=TAXID_DIST_COLUMN ) else: Column.new_column(o_view, - b"taxid_dist", + TAXID_DIST_COLUMN, OBI_INT, nb_elements_per_line=len(view), elements_names=[id for id in i_id_col], - alias=b"taxid_dist" # TODO what if it already exists + alias=TAXID_DIST_COLUMN ) - del(merged_infos) # Merged ids column if mergeIds : Column.new_column(o_view, - b"merged", + MERGED_COLUMN, OBI_STR, tuples=True, - alias=b"merged" # TODO what if it already exists + alias=MERGED_COLUMN ) - # Keep columns that are going to be used a lot in variables o_id_col = o_view[ID_COLUMN] - if b"taxid_dist" in o_view: - o_taxid_dist_col = o_view[b"taxid_dist"] - if b"merged" in o_view: - o_merged_col = o_view[b"merged"] + if TAXID_DIST_COLUMN in o_view: + o_taxid_dist_col = o_view[TAXID_DIST_COLUMN] + if MERGED_COLUMN in o_view: + o_merged_col = o_view[MERGED_COLUMN] print("\n") # TODO because in the middle of progress bar. Better solution? logger("info", "Second browsing through the input") @@ -383,11 +392,15 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li o_id = o_seq.id if mergeIds: - o_merged_col[o_idx] = [view[idx].id for idx in merged_sequences] + merged_list = [view[idx].id for idx in merged_sequences] + if MERGED_COLUMN in view: # merge all ids if there's already some merged ids info + merged_list.extend(view[MERGED_COLUMN][idx] for idx in merged_sequences) + merged_list = list(set(merged_list)) # deduplicate the list + o_merged_col[o_idx] = merged_list o_seq[COUNT_COLUMN] = 0 - if b"taxid_dist" in u_seq and i_taxid_dist_col[u_idx] is not None: + if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None: taxid_dist_dict = i_taxid_dist_col[u_idx] else: taxid_dist_dict = {} @@ -407,19 +420,19 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li i_count = i_seq[COUNT_COLUMN] o_seq[COUNT_COLUMN] += i_count - + for k in range(k_count): key = mergedKeys[k] mkey = mergedKeys_m[k] if key==TAXID_COLUMN and mergeIds: - if b"taxid_dist" in i_seq: + if TAXID_DIST_COLUMN in i_seq: taxid_dist_dict.update(i_taxid_dist_col[i_idx]) if TAXID_COLUMN in i_seq: taxid_dist_dict[i_id] = i_taxid_col[i_idx] - #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive + # merge relevant keys if key in i_seq: to_merge = i_seq[key] if to_merge is not None: @@ -431,21 +444,20 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li else: mcol[to_merge] = mcol[to_merge] + i_count o_seq[key] = None - #cas ou merged_keys existe deja - else: # TODO is this a good else - if mkey in i_seq: - mcol = merged_dict[mkey] - i_mcol = i_seq[mkey] - if i_mcol is not None: - for key2 in i_mcol: - if mcol[key2] is None: - mcol[key2] = i_mcol[key2] - else: - mcol[key2] = mcol[key2] + i_mcol[key2] + # merged infos already in seq: merge the merged infos + if mkey in i_seq: + mcol = merged_dict[mkey] # dict + i_mcol = i_seq[mkey] # column line + if i_mcol.is_NA() == False: + for key2 in i_mcol: + if key2 not in mcol: + mcol[key2] = i_mcol[key2] + else: + mcol[key2] = mcol[key2] + i_mcol[key2] # Write taxid_dist if mergeIds and TAXID_COLUMN in mergedKeys: - if b"taxid_dist" in str_merged_cols: + if TAXID_DIST_COLUMN in str_merged_cols: o_taxid_dist_col[o_idx] = str(taxid_dist_dict) else: o_taxid_dist_col[o_idx] = taxid_dist_dict @@ -456,7 +468,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li mkey_cols[mkey][o_idx] = str(merged_dict[mkey]) else: mkey_cols[mkey][o_idx] = merged_dict[mkey] - # Sets NA values to 0 # TODO discuss, maybe keep as None and test for None instead of testing for 0 in tools + # Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools #for key in mkey_cols[mkey][o_idx]: # if mkey_cols[mkey][o_idx][key] is None: # mkey_cols[mkey][o_idx][key] = 0 @@ -464,7 +476,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li for key in i_seq.keys(): # Delete informations that differ between the merged sequences # TODO make special columns list? - if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] : + if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \ + and key not in merged_dict : o_seq[key] = None o_idx += 1 diff --git a/python/obitools3/dms/capi/obiview.pxd b/python/obitools3/dms/capi/obiview.pxd index 87a6dcf..0ae2811 100755 --- a/python/obitools3/dms/capi/obiview.pxd +++ b/python/obitools3/dms/capi/obiview.pxd @@ -26,6 +26,10 @@ cdef extern from "obiview.h" nogil: extern const_char_p QUALITY_COLUMN extern const_char_p COUNT_COLUMN extern const_char_p TAXID_COLUMN + extern const_char_p MERGED_TAXID_COLUMN + extern const_char_p MERGED_PREFIX + extern const_char_p TAXID_DIST_COLUMN + extern const_char_p MERGED_COLUMN struct Alias_column_pair_t : Column_reference_t column_refs diff --git a/src/obiview.h b/src/obiview.h index be4b071..b7e4f5f 100755 --- a/src/obiview.h +++ b/src/obiview.h @@ -57,9 +57,14 @@ */ #define TAXID_COLUMN "TAXID" /**< The name of the column containing the taxids. TODO subtype of INT column? */ -#define MERGED_TAXID_COLUMN "merged_TAXID" /**< The name of the column containing the merged taxids information. - */ // TODO maybe mix of lower and uppercase == bad - +#define MERGED_TAXID_COLUMN "MERGED_TAXID" /**< The name of the column containing the merged taxids information. + */ +#define MERGED_PREFIX "MERGED_" /**< The prefix to prepend to column names when merging informations during obi uniq. + */ +#define TAXID_DIST_COLUMN "TAXID_DIST" /**< The name of the column containing a dictionary of taxid:[list of ids] when merging informations during obi uniq. + */ +#define MERGED_COLUMN "MERGED" /**< The name of the column containing a list of ids when merging informations during obi uniq. + */ #define ID_PREFIX "seq" /**< The default prefix of sequence identifiers in automatic ID columns. */ #define PREDICATE_KEY "predicates" /**< The key used in the json-formatted view comments to store predicates.