From 601a2cfd7df2d24277677668a56a24bd5838738e Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sat, 30 Mar 2019 20:34:53 +0100 Subject: [PATCH] obi uniq: various fixes... --- python/obitools3/commands/uniq.pyx | 36 ++++++++++++++++++------------ 1 file changed, 22 insertions(+), 14 deletions(-) mode change 100755 => 100644 python/obitools3/commands/uniq.pyx diff --git a/python/obitools3/commands/uniq.pyx b/python/obitools3/commands/uniq.pyx old mode 100755 new mode 100644 index 7f0be24..39abec9 --- a/python/obitools3/commands/uniq.pyx +++ b/python/obitools3/commands/uniq.pyx @@ -7,7 +7,7 @@ from obitools3.dms.obiseq cimport Nuc_Seq_Stored from obitools3.dms.view import RollbackException from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column, Column_line -from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN +from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption from obitools3.uri.decode import open_uri @@ -136,7 +136,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) : if m_taxids_dict[k] is not None: m_taxids.append(int(k)) taxid = taxonomy.last_common_taxon(*m_taxids) - seq[b"taxid"] = taxid + seq[TAXID_COLUMN] = taxid tsp = taxonomy.get_species(taxid) tgn = taxonomy.get_genus(taxid) tfa = taxonomy.get_family(taxid) @@ -227,7 +227,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li mergedKeys_set=set() if taxonomy is not None: - mergedKeys_set.add(b"taxid") + mergedKeys_set.add(TAXID_COLUMN) mergedKeys = list(mergedKeys_set) @@ -243,8 +243,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li # Keep columns that are going to be used a lot in variables i_seq_col = view[NUC_SEQUENCE_COLUMN] i_id_col = view[ID_COLUMN] - if b"taxid" in view: - i_taxid_col = view[b"taxid"] + if TAXID_COLUMN in view: + i_taxid_col = view[TAXID_COLUMN] if b"taxid_dist" in view: i_taxid_dist_col = view[b"taxid_dist"] @@ -278,6 +278,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li for k in range(k_count): key = mergedKeys[k] mkey = mergedKeys_m[k] +# if mkey in i_seq: # TODO +# if mkey not in merged_infos: +# merged_infos[mkey] = {} +# mkey_infos = merged_infos[mkey] +# mkey_infos['nb_elts'] = 1 +# mkey_infos['elt_names'] = [i_seq[key]] if key in i_seq: # TODO what if mkey already in i_seq? --> should update if mkey not in merged_infos: merged_infos[mkey] = {} @@ -321,14 +327,13 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li mkey_cols[merged_col_name] = o_view[merged_col_name] # taxid_dist column - if mergeIds and b"taxid" in mergedKeys: + if mergeIds and TAXID_COLUMN in mergedKeys: if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view str_merged_cols.append(b"taxid_dist") Column.new_column(o_view, b"taxid_dist", OBI_STR, to_eval=True, - comments=b"obi uniq taxid dist, stored as character strings to be read as dict", alias=b"taxid_dist" # TODO what if it already exists ) else: @@ -337,7 +342,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li OBI_INT, nb_elements_per_line=len(view), elements_names=[id for id in i_id_col], - comments=b"obi uniq taxid dist", alias=b"taxid_dist" # TODO what if it already exists ) @@ -350,7 +354,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li b"merged", OBI_STR, tuples=True, - comments=b"obi uniq merged ids", alias=b"merged" # TODO what if it already exists ) @@ -410,10 +413,10 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li key = mergedKeys[k] mkey = mergedKeys_m[k] - if key==b"taxid" and mergeIds: + if key==TAXID_COLUMN and mergeIds: if b"taxid_dist" in i_seq: taxid_dist_dict.update(i_taxid_dist_col[i_idx]) - if b"taxid" in i_seq: + if TAXID_COLUMN in i_seq: taxid_dist_dict[i_id] = i_taxid_col[i_idx] #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive @@ -441,7 +444,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li mcol[key2] = mcol[key2] + i_mcol[key2] # Write taxid_dist - if mergeIds and b"taxid" in mergedKeys: + if mergeIds and TAXID_COLUMN in mergedKeys: if b"taxid_dist" in str_merged_cols: o_taxid_dist_col[o_idx] = str(taxid_dist_dict) else: @@ -524,6 +527,9 @@ def run(config): except Exception, e: raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view) + pb(len(entries), force=True) + print("", file=sys.stderr) + # Save command config in View and DMS comments command_line = " ".join(sys.argv[1:]) input_dms_name=[input[0].name] @@ -534,9 +540,11 @@ def run(config): o_view.write_config(config, "uniq", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name) output[0].record_command_line(command_line) - print("\n") - print(repr(o_view)) + #print("\n\nOutput view:\n````````````", file=sys.stderr) + #print(repr(o_view), file=sys.stderr) input[0].close() output[0].close() + logger("info", "Done.") +