obi uniq: various improvements and fixes #66
@@ -7,12 +7,13 @@ from obitools3.dms.obiseq cimport Nuc_Seq_Stored
 from obitools3.dms.view import RollbackException
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column, Column_line
-from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN
+from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN, \
+                                        TAXID_DIST_COLUMN, MERGED_TAXID_COLUMN, MERGED_COLUMN, MERGED_PREFIX
 from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
 from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption
 from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
-from obitools3.utils cimport tobytes
+from obitools3.utils cimport tobytes, tostr
 
 import sys
 
@@ -34,12 +35,12 @@ def addOptions(parser):
 metavar="<TAG NAME>",
 default=[],
 type=str,
-help="Attributes to merge.") # TODO must be a 1 elt/line column
+help="Attributes to merge.") # note: must be a 1 elt/line column, but columns containing already merged information (name MERGED_*) are automatically remerged
 
 group.add_argument('--merge-ids', '-e',
 action="store_true", dest="uniq:mergeids",
 default=False,
-help="ONLY WORKING ON SMALL SETS FOR NOW Add the merged key with all ids of merged sequences.") # TODO ?
+help="Add the merged key with all ids of merged sequences.")
 
 group.add_argument('--category-attribute', '-c',
 action="append", dest="uniq:categories",
@@ -129,9 +130,9 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
 )
 
 for seq in o_view:
-if b"merged_taxid" in seq :
+if MERGED_TAXID_COLUMN in seq :
 m_taxids = []
-m_taxids_dict = seq[b"merged_taxid"]
+m_taxids_dict = seq[MERGED_TAXID_COLUMN]
 for k in m_taxids_dict.keys() :
 if m_taxids_dict[k] is not None:
 m_taxids.append(int(k))
@@ -171,7 +172,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
 seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid)
 
 
-cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=10000) :
+cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
 
 cdef int i
 cdef int k
@@ -215,9 +216,18 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 cdef object iter_view
 cdef object mcol
 cdef object to_merge
+cdef list merged_list
 
 uniques = {}
 
+for column_name in view.keys():
+if column_name[:7] == b"MERGED_":
+info_to_merge = column_name[7:]
+if mergedKeys_list is not None:
+mergedKeys_list.append(tostr(info_to_merge))
+else:
+mergedKeys_list = [tostr(info_to_merge)]
+
 mergedKeys_list_b = []
 if mergedKeys_list is not None:
 for key_str in mergedKeys_list:
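The block added above makes `obi uniq` pick up its own previous output: any input column whose name starts with the `MERGED_` prefix is recognised as already-merged information and its underlying attribute is re-added to the list of keys to merge. A minimal plain-Python sketch of that detection (the `collect_merge_keys` helper and its inputs are illustrative stand-ins, not the obitools3 API):

```python
# Sketch: turn already-merged column names (MERGED_<attr>) back into merge keys.
# Column names are bytes, mirroring the b"MERGED_" prefix test in the hunk above.

def collect_merge_keys(column_names, merged_keys=None):
    for column_name in column_names:
        if column_name[:7] == b"MERGED_":       # e.g. b"MERGED_sample"
            info_to_merge = column_name[7:]     # -> b"sample"
            if merged_keys is not None:
                merged_keys.append(info_to_merge.decode())
            else:
                merged_keys = [info_to_merge.decode()]
    return merged_keys

print(collect_merge_keys([b"COUNT", b"MERGED_sample"], ["taxid"]))
# ['taxid', 'sample']
```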
@@ -235,7 +245,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 
 mergedKeys_m = []
 for k in range(k_count):
-mergedKeys_m.append(b"merged_" + mergedKeys[k])
+mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])
 
 if categories is None:
 categories = []
@@ -245,8 +255,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 i_id_col = view[ID_COLUMN]
 if TAXID_COLUMN in view:
 i_taxid_col = view[TAXID_COLUMN]
-if b"taxid_dist" in view:
-i_taxid_dist_col = view[b"taxid_dist"]
+if TAXID_DIST_COLUMN in view:
+i_taxid_dist_col = view[TAXID_DIST_COLUMN]
 
 
 # First browsing
@@ -278,13 +288,13 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 for k in range(k_count):
 key = mergedKeys[k]
 mkey = mergedKeys_m[k]
-# if mkey in i_seq: # TODO
-# if mkey not in merged_infos:
-# merged_infos[mkey] = {}
-# mkey_infos = merged_infos[mkey]
-# mkey_infos['nb_elts'] = 1
-# mkey_infos['elt_names'] = [i_seq[key]]
-if key in i_seq: # TODO what if mkey already in i_seq? --> should update
+if mkey in i_seq:
+if mkey not in merged_infos:
+merged_infos[mkey] = {}
+mkey_infos = merged_infos[mkey]
+mkey_infos['nb_elts'] = view[mkey].nb_elements_per_line
+mkey_infos['elt_names'] = view[mkey].elements_names
+if key in i_seq:
 if mkey not in merged_infos:
 merged_infos[mkey] = {}
 mkey_infos = merged_infos[mkey]
@@ -304,6 +314,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 key = mergedKeys[k]
 merged_col_name = mergedKeys_m[k]
 i_col = view[key]
+
 if merged_infos[merged_col_name]['nb_elts'] > max_elts:
 str_merged_cols.append(merged_col_name)
 Column.new_column(o_view,
@@ -311,7 +322,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 OBI_STR,
 to_eval=True,
 comments=i_col.comments,
-alias=merged_col_name # TODO what if it already exists
+alias=merged_col_name
 )
 
 else:
@@ -321,7 +332,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
 elements_names=list(merged_infos[merged_col_name]['elt_names']),
 comments=i_col.comments,
-alias=merged_col_name # TODO what if it already exists
+alias=merged_col_name
 )
 
 mkey_cols[merged_col_name] = o_view[merged_col_name]
@@ -329,41 +340,39 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 # taxid_dist column
 if mergeIds and TAXID_COLUMN in mergedKeys:
 if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view
-str_merged_cols.append(b"taxid_dist")
+str_merged_cols.append(TAXID_DIST_COLUMN)
 Column.new_column(o_view,
-b"taxid_dist",
+TAXID_DIST_COLUMN,
 OBI_STR,
 to_eval=True,
-alias=b"taxid_dist" # TODO what if it already exists
+alias=TAXID_DIST_COLUMN
 )
 else:
 Column.new_column(o_view,
-b"taxid_dist",
+TAXID_DIST_COLUMN,
 OBI_INT,
 nb_elements_per_line=len(view),
 elements_names=[id for id in i_id_col],
-alias=b"taxid_dist" # TODO what if it already exists
+alias=TAXID_DIST_COLUMN
 )
 
 
 del(merged_infos)
 
 # Merged ids column
 if mergeIds :
 Column.new_column(o_view,
-b"merged",
+MERGED_COLUMN,
 OBI_STR,
 tuples=True,
-alias=b"merged" # TODO what if it already exists
+alias=MERGED_COLUMN
 )
 
 
 # Keep columns that are going to be used a lot in variables
 o_id_col = o_view[ID_COLUMN]
-if b"taxid_dist" in o_view:
-o_taxid_dist_col = o_view[b"taxid_dist"]
-if b"merged" in o_view:
-o_merged_col = o_view[b"merged"]
+if TAXID_DIST_COLUMN in o_view:
+o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
+if MERGED_COLUMN in o_view:
+o_merged_col = o_view[MERGED_COLUMN]
 
 print("\n") # TODO because in the middle of progress bar. Better solution?
 logger("info", "Second browsing through the input")
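As with the other merged columns, the taxid_dist column above falls back to an evaluable string column when one integer element per input id would exceed `max_elts`; otherwise every id gets its own OBI_INT element. A toy sketch of that size-based choice (the `store_taxid_dist` helper is illustrative only, not the obitools3 API):

```python
# Sketch: store a taxid distribution either as one element per id (small views)
# or as a string to be re-evaluated later (views wider than max_elts).

def store_taxid_dist(taxid_dist, n_ids, max_elts=1000000):
    if n_ids > max_elts:
        return str(taxid_dist)   # written into an OBI_STR column with to_eval=True
    return taxid_dist            # written into an OBI_INT column, one element per id

print(store_taxid_dist({b"seq1": 9606, b"seq2": 10090}, n_ids=2, max_elts=1))
# "{b'seq1': 9606, b'seq2': 10090}"
```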
@@ -383,11 +392,15 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 o_id = o_seq.id
 
 if mergeIds:
-o_merged_col[o_idx] = [view[idx].id for idx in merged_sequences]
+merged_list = [view[idx].id for idx in merged_sequences]
+if MERGED_COLUMN in view: # merge all ids if there's already some merged ids info
+merged_list.extend(view[MERGED_COLUMN][idx] for idx in merged_sequences)
+merged_list = list(set(merged_list)) # deduplicate the list
+o_merged_col[o_idx] = merged_list
 
 o_seq[COUNT_COLUMN] = 0
 
-if b"taxid_dist" in u_seq and i_taxid_dist_col[u_idx] is not None:
+if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
 taxid_dist_dict = i_taxid_dist_col[u_idx]
 else:
 taxid_dist_dict = {}
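When `--merge-ids` is used, the rewritten block above no longer stores only the ids of the directly merged sequences: if the input view already carries a MERGED column from a previous run, those ids are folded in as well and the resulting list is deduplicated. A rough self-contained sketch of that accumulation (plain dicts and a hypothetical `merged` field stand in for the view columns):

```python
# Sketch: collect the ids of the merged records plus any ids they already
# carry from an earlier merge, then deduplicate the combined list.

def merged_ids(records):
    ids = [rec["id"] for rec in records]
    for rec in records:
        ids.extend(rec.get("merged", []))   # previously merged ids, if any
    return sorted(set(ids))                 # deduplicated (sorted for a stable example)

print(merged_ids([
    {"id": b"seq1", "merged": [b"seq1", b"seq7"]},
    {"id": b"seq2"},
]))
# [b'seq1', b'seq2', b'seq7']
```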
@@ -414,12 +427,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 mkey = mergedKeys_m[k]
 
 if key==TAXID_COLUMN and mergeIds:
-if b"taxid_dist" in i_seq:
+if TAXID_DIST_COLUMN in i_seq:
 taxid_dist_dict.update(i_taxid_dist_col[i_idx])
 if TAXID_COLUMN in i_seq:
 taxid_dist_dict[i_id] = i_taxid_col[i_idx]
 
-#cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
+# merge relevant keys
 if key in i_seq:
 to_merge = i_seq[key]
 if to_merge is not None:
@@ -431,21 +444,20 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 else:
 mcol[to_merge] = mcol[to_merge] + i_count
 o_seq[key] = None
-#cas ou merged_keys existe deja
-else: # TODO is this a good else
-if mkey in i_seq:
-mcol = merged_dict[mkey]
-i_mcol = i_seq[mkey]
-if i_mcol is not None:
-for key2 in i_mcol:
-if mcol[key2] is None:
-mcol[key2] = i_mcol[key2]
-else:
-mcol[key2] = mcol[key2] + i_mcol[key2]
+# merged infos already in seq: merge the merged infos
+if mkey in i_seq:
+mcol = merged_dict[mkey] # dict
+i_mcol = i_seq[mkey] # column line
+if i_mcol.is_NA() == False:
+for key2 in i_mcol:
+if key2 not in mcol:
+mcol[key2] = i_mcol[key2]
+else:
+mcol[key2] = mcol[key2] + i_mcol[key2]
 
 # Write taxid_dist
 if mergeIds and TAXID_COLUMN in mergedKeys:
-if b"taxid_dist" in str_merged_cols:
+if TAXID_DIST_COLUMN in str_merged_cols:
 o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
 else:
 o_taxid_dist_col[o_idx] = taxid_dist_dict
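The rewritten block above merges per-value counts whenever the incoming sequence already carries the merged key, instead of only handling it in an `else` branch: values missing from the output dictionary are created, existing ones are summed. A small sketch of that dictionary merge (plain dicts standing in for the column lines):

```python
# Sketch: fold the counts of an incoming merged_* dictionary (i_mcol)
# into the accumulated output dictionary (mcol).

def merge_counts(mcol, i_mcol):
    for key2, count in i_mcol.items():
        if key2 not in mcol:
            mcol[key2] = count
        else:
            mcol[key2] = mcol[key2] + count
    return mcol

print(merge_counts({b"sampleA": 3}, {b"sampleA": 2, b"sampleB": 1}))
# {b'sampleA': 5, b'sampleB': 1}
```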
@@ -456,7 +468,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
 else:
 mkey_cols[mkey][o_idx] = merged_dict[mkey]
-# Sets NA values to 0 # TODO discuss, maybe keep as None and test for None instead of testing for 0 in tools
+# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
 #for key in mkey_cols[mkey][o_idx]:
 # if mkey_cols[mkey][o_idx][key] is None:
 # mkey_cols[mkey][o_idx][key] = 0
@@ -464,7 +476,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
 for key in i_seq.keys():
 # Delete informations that differ between the merged sequences
 # TODO make special columns list?
-if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] :
+if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
+and key not in merged_dict :
 o_seq[key] = None
 
 o_idx += 1
@@ -26,6 +26,10 @@ cdef extern from "obiview.h" nogil:
 extern const_char_p QUALITY_COLUMN
 extern const_char_p COUNT_COLUMN
 extern const_char_p TAXID_COLUMN
+extern const_char_p MERGED_TAXID_COLUMN
+extern const_char_p MERGED_PREFIX
+extern const_char_p TAXID_DIST_COLUMN
+extern const_char_p MERGED_COLUMN
 
 struct Alias_column_pair_t :
 Column_reference_t column_refs
@@ -57,9 +57,14 @@
 */
 #define TAXID_COLUMN "TAXID" /**< The name of the column containing the taxids. TODO subtype of INT column?
 */
-#define MERGED_TAXID_COLUMN "merged_TAXID" /**< The name of the column containing the merged taxids information.
-*/ // TODO maybe mix of lower and uppercase == bad
+#define MERGED_TAXID_COLUMN "MERGED_TAXID" /**< The name of the column containing the merged taxids information.
+*/
+#define MERGED_PREFIX "MERGED_" /**< The prefix to prepend to column names when merging informations during obi uniq.
+*/
+#define TAXID_DIST_COLUMN "TAXID_DIST" /**< The name of the column containing a dictionary of taxid:[list of ids] when merging informations during obi uniq.
+*/
+#define MERGED_COLUMN "MERGED" /**< The name of the column containing a list of ids when merging informations during obi uniq.
+*/
 #define ID_PREFIX "seq" /**< The default prefix of sequence identifiers in automatic ID columns.
 */
 #define PREDICATE_KEY "predicates" /**< The key used in the json-formatted view comments to store predicates.