obi uniq: added the option to merge ids; for now it only works on small
sets, until lists are properly implemented using obiblobs
This commit is contained in:
@ -6,7 +6,7 @@ from obitools3.dms.view.view cimport View, Line
|
|||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
from obitools3.dms.column.column cimport Column, Column_line
|
from obitools3.dms.column.column cimport Column, Column_line
|
||||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
|
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
|
||||||
from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
|
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
|
||||||
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
|
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
|
||||||
from obitools3.uri.decode import open_uri
|
from obitools3.uri.decode import open_uri
|
||||||
from obitools3.apps.config import logger
|
from obitools3.apps.config import logger
|
||||||
@ -36,7 +36,7 @@ def addOptions(parser):
|
|||||||
group.add_argument('--merge-ids', '-e',
|
group.add_argument('--merge-ids', '-e',
|
||||||
action="store_true", dest="uniq:mergeids",
|
action="store_true", dest="uniq:mergeids",
|
||||||
default=False,
|
default=False,
|
||||||
help="Add the merged key with all ids of merged sequences.")
|
help="ONLY WORKING ON SMALL SETS FOR NOW Add the merged key with all ids of merged sequences.")
|
||||||
|
|
||||||
group.add_argument('--category-attribute', '-c',
|
group.add_argument('--category-attribute', '-c',
|
||||||
action="append", dest="uniq:categories",
|
action="append", dest="uniq:categories",
|
||||||
@ -55,8 +55,6 @@ def addOptions(parser):
|
|||||||
"sequences of any couple of groups are not the"
|
"sequences of any couple of groups are not the"
|
||||||
"prefix of the other one.")
|
"prefix of the other one.")
|
||||||
|
|
||||||
# TODO taxonomy
|
|
||||||
|
|
||||||
|
|
||||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
|
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
|
||||||
|
|
||||||
@ -81,6 +79,10 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
cdef Column_line mcol
|
cdef Column_line mcol
|
||||||
cdef Column_line i_mcol
|
cdef Column_line i_mcol
|
||||||
cdef list catl
|
cdef list catl
|
||||||
|
cdef dict merged_ids_dict
|
||||||
|
|
||||||
|
cdef bytes k
|
||||||
|
cdef int n
|
||||||
|
|
||||||
#print(categories)
|
#print(categories)
|
||||||
|
|
||||||
@ -134,6 +136,7 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
del(merged_infos)
|
del(merged_infos)
|
||||||
|
|
||||||
logger("info", "Second browsing through the input")
|
logger("info", "Second browsing through the input")
|
||||||
|
merged_ids_dict = {}
|
||||||
i = 0
|
i = 0
|
||||||
o_idx = 0
|
o_idx = 0
|
||||||
seq_col = view[NUC_SEQUENCE_COLUMN]
|
seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||||
@ -199,8 +202,9 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
|
if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
|
||||||
o_seq[key_b] = None
|
o_seq[key_b] = None
|
||||||
|
|
||||||
# if mergeIds: # TODO
|
if mergeIds :
|
||||||
# u_seq['merged'].append(i_seq.id)
|
merged_ids_dict[o_seq.id].append(i_seq.id)
|
||||||
|
#o_seq['merged'].append(i_seq.id)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
o_view[o_idx] = i_seq
|
o_view[o_idx] = i_seq
|
||||||
@ -229,10 +233,32 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
mcol[to_merge] = o_seq[COUNT_COLUMN]
|
mcol[to_merge] = o_seq[COUNT_COLUMN]
|
||||||
o_seq[key] = None # TODO delete column eventually -> make C function?
|
o_seq[key] = None # TODO delete column eventually -> make C function?
|
||||||
|
|
||||||
# if mergeIds:
|
if mergeIds:
|
||||||
# u_seq['merged']=[o_seq.id]
|
merged_ids_dict[o_seq.id] = [o_seq.id] # TODO check that this id is added too in the original obiuniq
|
||||||
|
#o_seq['merged']=[o_seq.id]
|
||||||
|
|
||||||
i+=1
|
i+=1
|
||||||
|
|
||||||
|
# Merged ids column
|
||||||
|
if mergeIds :
|
||||||
|
nb_ids_max = 0
|
||||||
|
for k in merged_ids_dict :
|
||||||
|
n = len(merged_ids_dict[k])
|
||||||
|
if n > nb_ids_max :
|
||||||
|
nb_ids_max = n
|
||||||
|
|
||||||
|
Column.new_column(o_view,
|
||||||
|
"merged",
|
||||||
|
OBI_STR,
|
||||||
|
nb_elements_per_line=nb_ids_max,
|
||||||
|
elements_names=None,
|
||||||
|
comments="obi uniq merged ids",
|
||||||
|
alias="merged" # TODO what if it already exists
|
||||||
|
)
|
||||||
|
|
||||||
|
for o_seq in o_view:
|
||||||
|
o_seq['merged'] = merged_ids_dict[o_seq.id]
|
||||||
|
|
||||||
#TODO
|
#TODO
|
||||||
#if taxonomy is not None:
|
#if taxonomy is not None:
|
||||||
# mergeTaxonomyClassification(uniqSeq, taxonomy)
|
# mergeTaxonomyClassification(uniqSeq, taxonomy)
|
||||||
|
Reference in New Issue
Block a user