obi uniq: various fixes...
This commit is contained in:
36
python/obitools3/commands/uniq.pyx
Executable file → Normal file
36
python/obitools3/commands/uniq.pyx
Executable file → Normal file
@ -7,7 +7,7 @@ from obitools3.dms.obiseq cimport Nuc_Seq_Stored
|
|||||||
from obitools3.dms.view import RollbackException
|
from obitools3.dms.view import RollbackException
|
||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
from obitools3.dms.column.column cimport Column, Column_line
|
from obitools3.dms.column.column cimport Column, Column_line
|
||||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
|
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN
|
||||||
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
|
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
|
||||||
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption
|
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption
|
||||||
from obitools3.uri.decode import open_uri
|
from obitools3.uri.decode import open_uri
|
||||||
@ -136,7 +136,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
if m_taxids_dict[k] is not None:
|
if m_taxids_dict[k] is not None:
|
||||||
m_taxids.append(int(k))
|
m_taxids.append(int(k))
|
||||||
taxid = taxonomy.last_common_taxon(*m_taxids)
|
taxid = taxonomy.last_common_taxon(*m_taxids)
|
||||||
seq[b"taxid"] = taxid
|
seq[TAXID_COLUMN] = taxid
|
||||||
tsp = taxonomy.get_species(taxid)
|
tsp = taxonomy.get_species(taxid)
|
||||||
tgn = taxonomy.get_genus(taxid)
|
tgn = taxonomy.get_genus(taxid)
|
||||||
tfa = taxonomy.get_family(taxid)
|
tfa = taxonomy.get_family(taxid)
|
||||||
@ -227,7 +227,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
mergedKeys_set=set()
|
mergedKeys_set=set()
|
||||||
|
|
||||||
if taxonomy is not None:
|
if taxonomy is not None:
|
||||||
mergedKeys_set.add(b"taxid")
|
mergedKeys_set.add(TAXID_COLUMN)
|
||||||
|
|
||||||
mergedKeys = list(mergedKeys_set)
|
mergedKeys = list(mergedKeys_set)
|
||||||
|
|
||||||
@ -243,8 +243,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
# Keep the frequently used columns in local variables
|
# Keep the frequently used columns in local variables
|
||||||
i_seq_col = view[NUC_SEQUENCE_COLUMN]
|
i_seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||||
i_id_col = view[ID_COLUMN]
|
i_id_col = view[ID_COLUMN]
|
||||||
if b"taxid" in view:
|
if TAXID_COLUMN in view:
|
||||||
i_taxid_col = view[b"taxid"]
|
i_taxid_col = view[TAXID_COLUMN]
|
||||||
if b"taxid_dist" in view:
|
if b"taxid_dist" in view:
|
||||||
i_taxid_dist_col = view[b"taxid_dist"]
|
i_taxid_dist_col = view[b"taxid_dist"]
|
||||||
|
|
||||||
@ -278,6 +278,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
for k in range(k_count):
|
for k in range(k_count):
|
||||||
key = mergedKeys[k]
|
key = mergedKeys[k]
|
||||||
mkey = mergedKeys_m[k]
|
mkey = mergedKeys_m[k]
|
||||||
|
# if mkey in i_seq: # TODO
|
||||||
|
# if mkey not in merged_infos:
|
||||||
|
# merged_infos[mkey] = {}
|
||||||
|
# mkey_infos = merged_infos[mkey]
|
||||||
|
# mkey_infos['nb_elts'] = 1
|
||||||
|
# mkey_infos['elt_names'] = [i_seq[key]]
|
||||||
if key in i_seq: # TODO what if mkey already in i_seq? --> should update
|
if key in i_seq: # TODO what if mkey already in i_seq? --> should update
|
||||||
if mkey not in merged_infos:
|
if mkey not in merged_infos:
|
||||||
merged_infos[mkey] = {}
|
merged_infos[mkey] = {}
|
||||||
@ -321,14 +327,13 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
mkey_cols[merged_col_name] = o_view[merged_col_name]
|
mkey_cols[merged_col_name] = o_view[merged_col_name]
|
||||||
|
|
||||||
# taxid_dist column
|
# taxid_dist column
|
||||||
if mergeIds and b"taxid" in mergedKeys:
|
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||||
if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view
|
if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view
|
||||||
str_merged_cols.append(b"taxid_dist")
|
str_merged_cols.append(b"taxid_dist")
|
||||||
Column.new_column(o_view,
|
Column.new_column(o_view,
|
||||||
b"taxid_dist",
|
b"taxid_dist",
|
||||||
OBI_STR,
|
OBI_STR,
|
||||||
to_eval=True,
|
to_eval=True,
|
||||||
comments=b"obi uniq taxid dist, stored as character strings to be read as dict",
|
|
||||||
alias=b"taxid_dist" # TODO what if it already exists
|
alias=b"taxid_dist" # TODO what if it already exists
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@ -337,7 +342,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
OBI_INT,
|
OBI_INT,
|
||||||
nb_elements_per_line=len(view),
|
nb_elements_per_line=len(view),
|
||||||
elements_names=[id for id in i_id_col],
|
elements_names=[id for id in i_id_col],
|
||||||
comments=b"obi uniq taxid dist",
|
|
||||||
alias=b"taxid_dist" # TODO what if it already exists
|
alias=b"taxid_dist" # TODO what if it already exists
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -350,7 +354,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
b"merged",
|
b"merged",
|
||||||
OBI_STR,
|
OBI_STR,
|
||||||
tuples=True,
|
tuples=True,
|
||||||
comments=b"obi uniq merged ids",
|
|
||||||
alias=b"merged" # TODO what if it already exists
|
alias=b"merged" # TODO what if it already exists
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -410,10 +413,10 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
key = mergedKeys[k]
|
key = mergedKeys[k]
|
||||||
mkey = mergedKeys_m[k]
|
mkey = mergedKeys_m[k]
|
||||||
|
|
||||||
if key==b"taxid" and mergeIds:
|
if key==TAXID_COLUMN and mergeIds:
|
||||||
if b"taxid_dist" in i_seq:
|
if b"taxid_dist" in i_seq:
|
||||||
taxid_dist_dict.update(i_taxid_dist_col[i_idx])
|
taxid_dist_dict.update(i_taxid_dist_col[i_idx])
|
||||||
if b"taxid" in i_seq:
|
if TAXID_COLUMN in i_seq:
|
||||||
taxid_dist_dict[i_id] = i_taxid_col[i_idx]
|
taxid_dist_dict[i_id] = i_taxid_col[i_idx]
|
||||||
|
|
||||||
# Case where the merged keys are being updated but the incoming sequence has no merged keys
|
# Case where the merged keys are being updated but the incoming sequence has no merged keys
|
||||||
@ -441,7 +444,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
mcol[key2] = mcol[key2] + i_mcol[key2]
|
mcol[key2] = mcol[key2] + i_mcol[key2]
|
||||||
|
|
||||||
# Write taxid_dist
|
# Write taxid_dist
|
||||||
if mergeIds and b"taxid" in mergedKeys:
|
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||||
if b"taxid_dist" in str_merged_cols:
|
if b"taxid_dist" in str_merged_cols:
|
||||||
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
|
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
|
||||||
else:
|
else:
|
||||||
@ -524,6 +527,9 @@ def run(config):
|
|||||||
except Exception, e:
|
except Exception, e:
|
||||||
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
||||||
|
|
||||||
|
pb(len(entries), force=True)
|
||||||
|
print("", file=sys.stderr)
|
||||||
|
|
||||||
# Save command config in View and DMS comments
|
# Save command config in View and DMS comments
|
||||||
command_line = " ".join(sys.argv[1:])
|
command_line = " ".join(sys.argv[1:])
|
||||||
input_dms_name=[input[0].name]
|
input_dms_name=[input[0].name]
|
||||||
@ -534,9 +540,11 @@ def run(config):
|
|||||||
o_view.write_config(config, "uniq", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
o_view.write_config(config, "uniq", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
||||||
output[0].record_command_line(command_line)
|
output[0].record_command_line(command_line)
|
||||||
|
|
||||||
print("\n")
|
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
||||||
print(repr(o_view))
|
#print(repr(o_view), file=sys.stderr)
|
||||||
|
|
||||||
input[0].close()
|
input[0].close()
|
||||||
output[0].close()
|
output[0].close()
|
||||||
|
|
||||||
|
logger("info", "Done.")
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user