obi uniq: various fixes...

This commit is contained in:
Celine Mercier
2019-03-30 20:34:53 +01:00
parent 7c518300a0
commit 601a2cfd7d

36
python/obitools3/commands/uniq.pyx Executable file → Normal file
View File

@ -7,7 +7,7 @@ from obitools3.dms.obiseq cimport Nuc_Seq_Stored
from obitools3.dms.view import RollbackException
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption
from obitools3.uri.decode import open_uri
@ -136,7 +136,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
if m_taxids_dict[k] is not None:
m_taxids.append(int(k))
taxid = taxonomy.last_common_taxon(*m_taxids)
seq[b"taxid"] = taxid
seq[TAXID_COLUMN] = taxid
tsp = taxonomy.get_species(taxid)
tgn = taxonomy.get_genus(taxid)
tfa = taxonomy.get_family(taxid)
@ -227,7 +227,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
mergedKeys_set=set()
if taxonomy is not None:
mergedKeys_set.add(b"taxid")
mergedKeys_set.add(TAXID_COLUMN)
mergedKeys = list(mergedKeys_set)
@ -243,8 +243,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
# Keep columns that are going to be used a lot in variables
i_seq_col = view[NUC_SEQUENCE_COLUMN]
i_id_col = view[ID_COLUMN]
if b"taxid" in view:
i_taxid_col = view[b"taxid"]
if TAXID_COLUMN in view:
i_taxid_col = view[TAXID_COLUMN]
if b"taxid_dist" in view:
i_taxid_dist_col = view[b"taxid_dist"]
@ -278,6 +278,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for k in range(k_count):
key = mergedKeys[k]
mkey = mergedKeys_m[k]
# if mkey in i_seq: # TODO
# if mkey not in merged_infos:
# merged_infos[mkey] = {}
# mkey_infos = merged_infos[mkey]
# mkey_infos['nb_elts'] = 1
# mkey_infos['elt_names'] = [i_seq[key]]
if key in i_seq: # TODO what if mkey already in i_seq? --> should update
if mkey not in merged_infos:
merged_infos[mkey] = {}
@ -321,14 +327,13 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
mkey_cols[merged_col_name] = o_view[merged_col_name]
# taxid_dist column
if mergeIds and b"taxid" in mergedKeys:
if mergeIds and TAXID_COLUMN in mergedKeys:
if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view
str_merged_cols.append(b"taxid_dist")
Column.new_column(o_view,
b"taxid_dist",
OBI_STR,
to_eval=True,
comments=b"obi uniq taxid dist, stored as character strings to be read as dict",
alias=b"taxid_dist" # TODO what if it already exists
)
else:
@ -337,7 +342,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
OBI_INT,
nb_elements_per_line=len(view),
elements_names=[id for id in i_id_col],
comments=b"obi uniq taxid dist",
alias=b"taxid_dist" # TODO what if it already exists
)
@ -350,7 +354,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
b"merged",
OBI_STR,
tuples=True,
comments=b"obi uniq merged ids",
alias=b"merged" # TODO what if it already exists
)
@ -410,10 +413,10 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
key = mergedKeys[k]
mkey = mergedKeys_m[k]
if key==b"taxid" and mergeIds:
if key==TAXID_COLUMN and mergeIds:
if b"taxid_dist" in i_seq:
taxid_dist_dict.update(i_taxid_dist_col[i_idx])
if b"taxid" in i_seq:
if TAXID_COLUMN in i_seq:
taxid_dist_dict[i_id] = i_taxid_col[i_idx]
# case where the merged keys are being updated but the incoming sequence has no merged keys
@ -441,7 +444,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
mcol[key2] = mcol[key2] + i_mcol[key2]
# Write taxid_dist
if mergeIds and b"taxid" in mergedKeys:
if mergeIds and TAXID_COLUMN in mergedKeys:
if b"taxid_dist" in str_merged_cols:
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
else:
@ -524,6 +527,9 @@ def run(config):
except Exception, e:
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
pb(len(entries), force=True)
print("", file=sys.stderr)
# Save command config in View and DMS comments
command_line = " ".join(sys.argv[1:])
input_dms_name=[input[0].name]
@ -534,9 +540,11 @@ def run(config):
o_view.write_config(config, "uniq", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
output[0].record_command_line(command_line)
print("\n")
print(repr(o_view))
#print("\n\nOutput view:\n````````````", file=sys.stderr)
#print(repr(o_view), file=sys.stderr)
input[0].close()
output[0].close()
logger("info", "Done.")