obi uniq: various improvements and fixes #66
This commit is contained in:
@ -7,12 +7,13 @@ from obitools3.dms.obiseq cimport Nuc_Seq_Stored
|
||||
from obitools3.dms.view import RollbackException
|
||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||
from obitools3.dms.column.column cimport Column, Column_line
|
||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN
|
||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN, \
|
||||
TAXID_DIST_COLUMN, MERGED_TAXID_COLUMN, MERGED_COLUMN, MERGED_PREFIX
|
||||
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
|
||||
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption
|
||||
from obitools3.uri.decode import open_uri
|
||||
from obitools3.apps.config import logger
|
||||
from obitools3.utils cimport tobytes
|
||||
from obitools3.utils cimport tobytes, tostr
|
||||
|
||||
import sys
|
||||
|
||||
@ -34,12 +35,12 @@ def addOptions(parser):
|
||||
metavar="<TAG NAME>",
|
||||
default=[],
|
||||
type=str,
|
||||
help="Attributes to merge.") # TODO must be a 1 elt/line column
|
||||
help="Attributes to merge.") # note: must be a 1 elt/line column, but columns containing already merged information (name MERGED_*) are automatically remerged
|
||||
|
||||
group.add_argument('--merge-ids', '-e',
|
||||
action="store_true", dest="uniq:mergeids",
|
||||
default=False,
|
||||
help="ONLY WORKING ON SMALL SETS FOR NOW Add the merged key with all ids of merged sequences.") # TODO ?
|
||||
help="Add the merged key with all ids of merged sequences.")
|
||||
|
||||
group.add_argument('--category-attribute', '-c',
|
||||
action="append", dest="uniq:categories",
|
||||
@ -129,9 +130,9 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
)
|
||||
|
||||
for seq in o_view:
|
||||
if b"merged_taxid" in seq :
|
||||
if MERGED_TAXID_COLUMN in seq :
|
||||
m_taxids = []
|
||||
m_taxids_dict = seq[b"merged_taxid"]
|
||||
m_taxids_dict = seq[MERGED_TAXID_COLUMN]
|
||||
for k in m_taxids_dict.keys() :
|
||||
if m_taxids_dict[k] is not None:
|
||||
m_taxids.append(int(k))
|
||||
@ -171,7 +172,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid)
|
||||
|
||||
|
||||
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=10000) :
|
||||
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
|
||||
|
||||
cdef int i
|
||||
cdef int k
|
||||
@ -215,9 +216,18 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
cdef object iter_view
|
||||
cdef object mcol
|
||||
cdef object to_merge
|
||||
cdef list merged_list
|
||||
|
||||
uniques = {}
|
||||
|
||||
for column_name in view.keys():
|
||||
if column_name[:7] == b"MERGED_":
|
||||
info_to_merge = column_name[7:]
|
||||
if mergedKeys_list is not None:
|
||||
mergedKeys_list.append(tostr(info_to_merge))
|
||||
else:
|
||||
mergedKeys_list = [tostr(info_to_merge)]
|
||||
|
||||
mergedKeys_list_b = []
|
||||
if mergedKeys_list is not None:
|
||||
for key_str in mergedKeys_list:
|
||||
@ -235,7 +245,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
|
||||
mergedKeys_m = []
|
||||
for k in range(k_count):
|
||||
mergedKeys_m.append(b"merged_" + mergedKeys[k])
|
||||
mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])
|
||||
|
||||
if categories is None:
|
||||
categories = []
|
||||
@ -245,8 +255,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
i_id_col = view[ID_COLUMN]
|
||||
if TAXID_COLUMN in view:
|
||||
i_taxid_col = view[TAXID_COLUMN]
|
||||
if b"taxid_dist" in view:
|
||||
i_taxid_dist_col = view[b"taxid_dist"]
|
||||
if TAXID_DIST_COLUMN in view:
|
||||
i_taxid_dist_col = view[TAXID_DIST_COLUMN]
|
||||
|
||||
|
||||
# First browsing
|
||||
@ -278,13 +288,13 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
for k in range(k_count):
|
||||
key = mergedKeys[k]
|
||||
mkey = mergedKeys_m[k]
|
||||
# if mkey in i_seq: # TODO
|
||||
# if mkey not in merged_infos:
|
||||
# merged_infos[mkey] = {}
|
||||
# mkey_infos = merged_infos[mkey]
|
||||
# mkey_infos['nb_elts'] = 1
|
||||
# mkey_infos['elt_names'] = [i_seq[key]]
|
||||
if key in i_seq: # TODO what if mkey already in i_seq? --> should update
|
||||
if mkey in i_seq:
|
||||
if mkey not in merged_infos:
|
||||
merged_infos[mkey] = {}
|
||||
mkey_infos = merged_infos[mkey]
|
||||
mkey_infos['nb_elts'] = view[mkey].nb_elements_per_line
|
||||
mkey_infos['elt_names'] = view[mkey].elements_names
|
||||
if key in i_seq:
|
||||
if mkey not in merged_infos:
|
||||
merged_infos[mkey] = {}
|
||||
mkey_infos = merged_infos[mkey]
|
||||
@ -304,6 +314,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
key = mergedKeys[k]
|
||||
merged_col_name = mergedKeys_m[k]
|
||||
i_col = view[key]
|
||||
|
||||
if merged_infos[merged_col_name]['nb_elts'] > max_elts:
|
||||
str_merged_cols.append(merged_col_name)
|
||||
Column.new_column(o_view,
|
||||
@ -311,7 +322,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
OBI_STR,
|
||||
to_eval=True,
|
||||
comments=i_col.comments,
|
||||
alias=merged_col_name # TODO what if it already exists
|
||||
alias=merged_col_name
|
||||
)
|
||||
|
||||
else:
|
||||
@ -321,7 +332,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
|
||||
elements_names=list(merged_infos[merged_col_name]['elt_names']),
|
||||
comments=i_col.comments,
|
||||
alias=merged_col_name # TODO what if it already exists
|
||||
alias=merged_col_name
|
||||
)
|
||||
|
||||
mkey_cols[merged_col_name] = o_view[merged_col_name]
|
||||
@ -329,41 +340,39 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
# taxid_dist column
|
||||
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||
if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view
|
||||
str_merged_cols.append(b"taxid_dist")
|
||||
str_merged_cols.append(TAXID_DIST_COLUMN)
|
||||
Column.new_column(o_view,
|
||||
b"taxid_dist",
|
||||
TAXID_DIST_COLUMN,
|
||||
OBI_STR,
|
||||
to_eval=True,
|
||||
alias=b"taxid_dist" # TODO what if it already exists
|
||||
alias=TAXID_DIST_COLUMN
|
||||
)
|
||||
else:
|
||||
Column.new_column(o_view,
|
||||
b"taxid_dist",
|
||||
TAXID_DIST_COLUMN,
|
||||
OBI_INT,
|
||||
nb_elements_per_line=len(view),
|
||||
elements_names=[id for id in i_id_col],
|
||||
alias=b"taxid_dist" # TODO what if it already exists
|
||||
alias=TAXID_DIST_COLUMN
|
||||
)
|
||||
|
||||
|
||||
del(merged_infos)
|
||||
|
||||
# Merged ids column
|
||||
if mergeIds :
|
||||
Column.new_column(o_view,
|
||||
b"merged",
|
||||
MERGED_COLUMN,
|
||||
OBI_STR,
|
||||
tuples=True,
|
||||
alias=b"merged" # TODO what if it already exists
|
||||
alias=MERGED_COLUMN
|
||||
)
|
||||
|
||||
|
||||
# Keep columns that are going to be used a lot in variables
|
||||
o_id_col = o_view[ID_COLUMN]
|
||||
if b"taxid_dist" in o_view:
|
||||
o_taxid_dist_col = o_view[b"taxid_dist"]
|
||||
if b"merged" in o_view:
|
||||
o_merged_col = o_view[b"merged"]
|
||||
if TAXID_DIST_COLUMN in o_view:
|
||||
o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
|
||||
if MERGED_COLUMN in o_view:
|
||||
o_merged_col = o_view[MERGED_COLUMN]
|
||||
|
||||
print("\n") # TODO because in the middle of progress bar. Better solution?
|
||||
logger("info", "Second browsing through the input")
|
||||
@ -383,11 +392,15 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
o_id = o_seq.id
|
||||
|
||||
if mergeIds:
|
||||
o_merged_col[o_idx] = [view[idx].id for idx in merged_sequences]
|
||||
merged_list = [view[idx].id for idx in merged_sequences]
|
||||
if MERGED_COLUMN in view: # merge all ids if there's already some merged ids info
|
||||
merged_list.extend(view[MERGED_COLUMN][idx] for idx in merged_sequences)
|
||||
merged_list = list(set(merged_list)) # deduplicate the list
|
||||
o_merged_col[o_idx] = merged_list
|
||||
|
||||
o_seq[COUNT_COLUMN] = 0
|
||||
|
||||
if b"taxid_dist" in u_seq and i_taxid_dist_col[u_idx] is not None:
|
||||
if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
|
||||
taxid_dist_dict = i_taxid_dist_col[u_idx]
|
||||
else:
|
||||
taxid_dist_dict = {}
|
||||
@ -414,12 +427,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
mkey = mergedKeys_m[k]
|
||||
|
||||
if key==TAXID_COLUMN and mergeIds:
|
||||
if b"taxid_dist" in i_seq:
|
||||
if TAXID_DIST_COLUMN in i_seq:
|
||||
taxid_dist_dict.update(i_taxid_dist_col[i_idx])
|
||||
if TAXID_COLUMN in i_seq:
|
||||
taxid_dist_dict[i_id] = i_taxid_col[i_idx]
|
||||
|
||||
#cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
|
||||
# merge relevant keys
|
||||
if key in i_seq:
|
||||
to_merge = i_seq[key]
|
||||
if to_merge is not None:
|
||||
@ -431,21 +444,20 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
else:
|
||||
mcol[to_merge] = mcol[to_merge] + i_count
|
||||
o_seq[key] = None
|
||||
#cas ou merged_keys existe deja
|
||||
else: # TODO is this a good else
|
||||
# merged infos already in seq: merge the merged infos
|
||||
if mkey in i_seq:
|
||||
mcol = merged_dict[mkey]
|
||||
i_mcol = i_seq[mkey]
|
||||
if i_mcol is not None:
|
||||
mcol = merged_dict[mkey] # dict
|
||||
i_mcol = i_seq[mkey] # column line
|
||||
if i_mcol.is_NA() == False:
|
||||
for key2 in i_mcol:
|
||||
if mcol[key2] is None:
|
||||
if key2 not in mcol:
|
||||
mcol[key2] = i_mcol[key2]
|
||||
else:
|
||||
mcol[key2] = mcol[key2] + i_mcol[key2]
|
||||
|
||||
# Write taxid_dist
|
||||
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||
if b"taxid_dist" in str_merged_cols:
|
||||
if TAXID_DIST_COLUMN in str_merged_cols:
|
||||
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
|
||||
else:
|
||||
o_taxid_dist_col[o_idx] = taxid_dist_dict
|
||||
@ -456,7 +468,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
|
||||
else:
|
||||
mkey_cols[mkey][o_idx] = merged_dict[mkey]
|
||||
# Sets NA values to 0 # TODO discuss, maybe keep as None and test for None instead of testing for 0 in tools
|
||||
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
|
||||
#for key in mkey_cols[mkey][o_idx]:
|
||||
# if mkey_cols[mkey][o_idx][key] is None:
|
||||
# mkey_cols[mkey][o_idx][key] = 0
|
||||
@ -464,7 +476,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
for key in i_seq.keys():
|
||||
# Delete informations that differ between the merged sequences
|
||||
# TODO make special columns list?
|
||||
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] :
|
||||
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
|
||||
and key not in merged_dict :
|
||||
o_seq[key] = None
|
||||
|
||||
o_idx += 1
|
||||
|
@ -26,6 +26,10 @@ cdef extern from "obiview.h" nogil:
|
||||
extern const_char_p QUALITY_COLUMN
|
||||
extern const_char_p COUNT_COLUMN
|
||||
extern const_char_p TAXID_COLUMN
|
||||
extern const_char_p MERGED_TAXID_COLUMN
|
||||
extern const_char_p MERGED_PREFIX
|
||||
extern const_char_p TAXID_DIST_COLUMN
|
||||
extern const_char_p MERGED_COLUMN
|
||||
|
||||
struct Alias_column_pair_t :
|
||||
Column_reference_t column_refs
|
||||
|
@ -57,9 +57,14 @@
|
||||
*/
|
||||
#define TAXID_COLUMN "TAXID" /**< The name of the column containing the taxids. TODO subtype of INT column?
|
||||
*/
|
||||
#define MERGED_TAXID_COLUMN "merged_TAXID" /**< The name of the column containing the merged taxids information.
|
||||
*/ // TODO maybe mix of lower and uppercase == bad
|
||||
|
||||
#define MERGED_TAXID_COLUMN "MERGED_TAXID" /**< The name of the column containing the merged taxids information.
|
||||
*/
|
||||
#define MERGED_PREFIX "MERGED_" /**< The prefix to prepend to column names when merging informations during obi uniq.
|
||||
*/
|
||||
#define TAXID_DIST_COLUMN "TAXID_DIST" /**< The name of the column containing a dictionary of taxid:[list of ids] when merging informations during obi uniq.
|
||||
*/
|
||||
#define MERGED_COLUMN "MERGED" /**< The name of the column containing a list of ids when merging informations during obi uniq.
|
||||
*/
|
||||
#define ID_PREFIX "seq" /**< The default prefix of sequence identifiers in automatic ID columns.
|
||||
*/
|
||||
#define PREDICATE_KEY "predicates" /**< The key used in the json-formatted view comments to store predicates.
|
||||
|
Reference in New Issue
Block a user