obi uniq: fixed a bug where merged values were wrongly reinitialized

This commit is contained in:
Celine Mercier
2018-05-17 14:58:15 +02:00
parent 31d8ba5085
commit b91b3176b0

View File

@ -15,7 +15,7 @@ from obitools3.apps.config import logger
from obitools3.utils cimport tobytes from obitools3.utils cimport tobytes
__title__="Groups sequence records together" __title__="Group sequence records together"
@ -278,7 +278,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for k in range(k_count): for k in range(k_count):
key = mergedKeys[k] key = mergedKeys[k]
mkey = mergedKeys_m[k] mkey = mergedKeys_m[k]
if key in i_seq: # TODO what if mkey already in i_seq? if key in i_seq: # TODO what if mkey already in i_seq? --> should update
if mkey not in merged_infos: if mkey not in merged_infos:
merged_infos[mkey] = {} merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey] mkey_infos = merged_infos[mkey]
@ -388,8 +388,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
else: else:
taxid_dist_dict = {} taxid_dist_dict = {}
merged_dict = {}
for mkey in mergedKeys_m:
merged_dict[mkey] = {}
for i_idx in merged_sequences: for i_idx in merged_sequences:
i_id = i_id_col[i_idx] i_id = i_id_col[i_idx]
i_seq = view[i_idx] i_seq = view[i_idx]
@ -399,13 +403,9 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
i_count = i_seq[COUNT_COLUMN] i_count = i_seq[COUNT_COLUMN]
o_seq[COUNT_COLUMN] += i_count o_seq[COUNT_COLUMN] += i_count
merged_dict = {}
for mkey in mergedKeys_m:
merged_dict[mkey] = {}
for k in range(k_count): for k in range(k_count):
key = mergedKeys[k] key = mergedKeys[k]
mkey = mergedKeys_m[k] mkey = mergedKeys_m[k]
@ -419,10 +419,10 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
if key in i_seq: if key in i_seq:
to_merge = i_seq[key] to_merge = i_seq[key]
if to_merge is not None: if to_merge is not None:
if type(to_merge) != bytes : if type(to_merge) != bytes:
to_merge = tobytes(str(to_merge)) to_merge = tobytes(str(to_merge))
mcol = merged_dict[mkey] mcol = merged_dict[mkey]
if to_merge not in mcol or mcol[to_merge] is None: if to_merge not in mcol or mcol[to_merge] is None:
mcol[to_merge] = i_count mcol[to_merge] = i_count
else: else:
mcol[to_merge] = mcol[to_merge] + i_count mcol[to_merge] = mcol[to_merge] + i_count
@ -452,6 +452,10 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
mkey_cols[mkey][o_idx] = str(merged_dict[mkey]) mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
else: else:
mkey_cols[mkey][o_idx] = merged_dict[mkey] mkey_cols[mkey][o_idx] = merged_dict[mkey]
# Sets NA values to 0 # TODO discuss, maybe keep as None and test for None instead of testing for 0 in tools
#for key in mkey_cols[mkey][o_idx]:
# if mkey_cols[mkey][o_idx][key] is None:
# mkey_cols[mkey][o_idx][key] = 0
for key in i_seq.keys(): for key in i_seq.keys():
# Delete information that differs between the merged sequences # Delete information that differs between the merged sequences