obi uniq: various fixes...
This commit is contained in:
36
python/obitools3/commands/uniq.pyx
Executable file → Normal file
36
python/obitools3/commands/uniq.pyx
Executable file → Normal file
@ -7,7 +7,7 @@ from obitools3.dms.obiseq cimport Nuc_Seq_Stored
|
||||
from obitools3.dms.view import RollbackException
|
||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||
from obitools3.dms.column.column cimport Column, Column_line
|
||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
|
||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN
|
||||
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
|
||||
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption
|
||||
from obitools3.uri.decode import open_uri
|
||||
@ -136,7 +136,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
if m_taxids_dict[k] is not None:
|
||||
m_taxids.append(int(k))
|
||||
taxid = taxonomy.last_common_taxon(*m_taxids)
|
||||
seq[b"taxid"] = taxid
|
||||
seq[TAXID_COLUMN] = taxid
|
||||
tsp = taxonomy.get_species(taxid)
|
||||
tgn = taxonomy.get_genus(taxid)
|
||||
tfa = taxonomy.get_family(taxid)
|
||||
@ -227,7 +227,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
mergedKeys_set=set()
|
||||
|
||||
if taxonomy is not None:
|
||||
mergedKeys_set.add(b"taxid")
|
||||
mergedKeys_set.add(TAXID_COLUMN)
|
||||
|
||||
mergedKeys = list(mergedKeys_set)
|
||||
|
||||
@ -243,8 +243,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
# Keep frequently used columns in local variables
|
||||
i_seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||
i_id_col = view[ID_COLUMN]
|
||||
if b"taxid" in view:
|
||||
i_taxid_col = view[b"taxid"]
|
||||
if TAXID_COLUMN in view:
|
||||
i_taxid_col = view[TAXID_COLUMN]
|
||||
if b"taxid_dist" in view:
|
||||
i_taxid_dist_col = view[b"taxid_dist"]
|
||||
|
||||
@ -278,6 +278,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
for k in range(k_count):
|
||||
key = mergedKeys[k]
|
||||
mkey = mergedKeys_m[k]
|
||||
# if mkey in i_seq: # TODO
|
||||
# if mkey not in merged_infos:
|
||||
# merged_infos[mkey] = {}
|
||||
# mkey_infos = merged_infos[mkey]
|
||||
# mkey_infos['nb_elts'] = 1
|
||||
# mkey_infos['elt_names'] = [i_seq[key]]
|
||||
if key in i_seq: # TODO what if mkey already in i_seq? --> should update
|
||||
if mkey not in merged_infos:
|
||||
merged_infos[mkey] = {}
|
||||
@ -321,14 +327,13 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
mkey_cols[merged_col_name] = o_view[merged_col_name]
|
||||
|
||||
# taxid_dist column
|
||||
if mergeIds and b"taxid" in mergedKeys:
|
||||
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||
if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view
|
||||
str_merged_cols.append(b"taxid_dist")
|
||||
Column.new_column(o_view,
|
||||
b"taxid_dist",
|
||||
OBI_STR,
|
||||
to_eval=True,
|
||||
comments=b"obi uniq taxid dist, stored as character strings to be read as dict",
|
||||
alias=b"taxid_dist" # TODO what if it already exists
|
||||
)
|
||||
else:
|
||||
@ -337,7 +342,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
OBI_INT,
|
||||
nb_elements_per_line=len(view),
|
||||
elements_names=[id for id in i_id_col],
|
||||
comments=b"obi uniq taxid dist",
|
||||
alias=b"taxid_dist" # TODO what if it already exists
|
||||
)
|
||||
|
||||
@ -350,7 +354,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
b"merged",
|
||||
OBI_STR,
|
||||
tuples=True,
|
||||
comments=b"obi uniq merged ids",
|
||||
alias=b"merged" # TODO what if it already exists
|
||||
)
|
||||
|
||||
@ -410,10 +413,10 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
key = mergedKeys[k]
|
||||
mkey = mergedKeys_m[k]
|
||||
|
||||
if key==b"taxid" and mergeIds:
|
||||
if key==TAXID_COLUMN and mergeIds:
|
||||
if b"taxid_dist" in i_seq:
|
||||
taxid_dist_dict.update(i_taxid_dist_col[i_idx])
|
||||
if b"taxid" in i_seq:
|
||||
if TAXID_COLUMN in i_seq:
|
||||
taxid_dist_dict[i_id] = i_taxid_col[i_idx]
|
||||
|
||||
# case where the merged keys are being updated but the incoming sequence has no merged keys
|
||||
@ -441,7 +444,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
mcol[key2] = mcol[key2] + i_mcol[key2]
|
||||
|
||||
# Write taxid_dist
|
||||
if mergeIds and b"taxid" in mergedKeys:
|
||||
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||
if b"taxid_dist" in str_merged_cols:
|
||||
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
|
||||
else:
|
||||
@ -524,6 +527,9 @@ def run(config):
|
||||
except Exception, e:
|
||||
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
||||
|
||||
pb(len(entries), force=True)
|
||||
print("", file=sys.stderr)
|
||||
|
||||
# Save command config in View and DMS comments
|
||||
command_line = " ".join(sys.argv[1:])
|
||||
input_dms_name=[input[0].name]
|
||||
@ -534,9 +540,11 @@ def run(config):
|
||||
o_view.write_config(config, "uniq", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
||||
output[0].record_command_line(command_line)
|
||||
|
||||
print("\n")
|
||||
print(repr(o_view))
|
||||
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
||||
#print(repr(o_view), file=sys.stderr)
|
||||
|
||||
input[0].close()
|
||||
output[0].close()
|
||||
|
||||
logger("info", "Done.")
|
||||
|
||||
|
Reference in New Issue
Block a user