obi uniq: added an option to use categories in addition to the sequence to
determine uniqueness
This commit is contained in:
@ -5,4 +5,4 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
|
|||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
|
|
||||||
|
|
||||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)
|
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||||
from obitools3.dms.dms cimport DMS
|
from obitools3.dms.dms cimport DMS
|
||||||
from obitools3.dms.taxo.taxo cimport Taxonomy
|
|
||||||
from obitools3.dms.view.view cimport View, Line
|
from obitools3.dms.view.view cimport View, Line
|
||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
from obitools3.dms.column.column cimport Column, Column_line
|
from obitools3.dms.column.column cimport Column, Column_line
|
||||||
@ -59,12 +58,12 @@ def addOptions(parser):
|
|||||||
# TODO taxonomy
|
# TODO taxonomy
|
||||||
|
|
||||||
|
|
||||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
|
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
|
||||||
|
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef int o_idx
|
cdef int o_idx
|
||||||
cdef int u_idx
|
cdef int u_idx
|
||||||
cdef int u_id
|
cdef tuple u_id
|
||||||
cdef int i_count
|
cdef int i_count
|
||||||
cdef set mergedKeys
|
cdef set mergedKeys
|
||||||
cdef dict uniques
|
cdef dict uniques
|
||||||
@ -80,23 +79,26 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxo
|
|||||||
cdef Column seq_col
|
cdef Column seq_col
|
||||||
cdef object to_merge
|
cdef object to_merge
|
||||||
cdef Column_line mcol
|
cdef Column_line mcol
|
||||||
cdef Column_line i_mcol
|
cdef Column_line i_mcol
|
||||||
|
cdef list catl
|
||||||
|
|
||||||
|
#print(categories)
|
||||||
|
|
||||||
uniques = {}
|
uniques = {}
|
||||||
|
|
||||||
if categories is None:
|
|
||||||
categories=[]
|
|
||||||
|
|
||||||
if mergedKeys_list is not None:
|
if mergedKeys_list is not None:
|
||||||
mergedKeys=set(mergedKeys_list)
|
mergedKeys=set(mergedKeys_list)
|
||||||
else:
|
else:
|
||||||
mergedKeys=set()
|
mergedKeys=set()
|
||||||
|
|
||||||
# if taxonomy is not None:
|
if taxonomy is not None:
|
||||||
# mergedKeys.add('taxid')
|
mergedKeys.add('taxid')
|
||||||
|
|
||||||
|
if categories is None:
|
||||||
|
categories = []
|
||||||
|
|
||||||
# Going through the columns to merge a first time, to create the merged columns with the right number of elements per line and the right element names
|
# Going through the columns to merge a first time, to create the merged columns with the right number of elements per line and the right element names
|
||||||
#logger("info", "obi uniq", "First browsing through the input")
|
logger("info", "First browsing through the input")
|
||||||
merged_infos = {}
|
merged_infos = {}
|
||||||
i = 0
|
i = 0
|
||||||
iter_view = iter(view)
|
iter_view = iter(view)
|
||||||
@ -131,20 +133,28 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxo
|
|||||||
|
|
||||||
del(merged_infos)
|
del(merged_infos)
|
||||||
|
|
||||||
#logger("info", "obi uniq", "Second browsing through the input")
|
logger("info", "Second browsing through the input")
|
||||||
i = 0
|
i = 0
|
||||||
o_idx = 0
|
o_idx = 0
|
||||||
seq_col = view[NUC_SEQUENCE_COLUMN]
|
seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||||
|
|
||||||
iter_view = iter(view)
|
iter_view = iter(view)
|
||||||
for i_seq in iter_view :
|
for i_seq in iter_view :
|
||||||
pb(i)
|
pb(i)
|
||||||
|
|
||||||
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
|
# This can't be done in the same line as the u_id tuple creation because it generates a bug
|
||||||
u_id = seq_col.get_line_idx(i)
|
# where Cython (version 0.25.2) does not detect the reference to the categs_list variable and deallocates
|
||||||
|
# it at the beginning of the function.
|
||||||
|
# (Only happens if categs_list is an optional parameter, which it is).
|
||||||
|
catl = []
|
||||||
|
for x in categories :
|
||||||
|
catl.append(i_seq[x])
|
||||||
|
|
||||||
|
u_id = tuple(catl) + (seq_col.get_line_idx(i),)
|
||||||
|
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),) # The line that cython can't read properly
|
||||||
|
|
||||||
if u_id in uniques:
|
if u_id in uniques:
|
||||||
|
|
||||||
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
|
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
|
||||||
i_count = 1
|
i_count = 1
|
||||||
else:
|
else:
|
||||||
@ -248,9 +258,11 @@ def run(config):
|
|||||||
input=False,
|
input=False,
|
||||||
newviewtype=View_NUC_SEQS)
|
newviewtype=View_NUC_SEQS)
|
||||||
|
|
||||||
|
# TODO exceptions not handled like they should be
|
||||||
|
|
||||||
entries = input[1]
|
entries = input[1]
|
||||||
o_view = output[1]
|
o_view = output[1]
|
||||||
|
|
||||||
# Initialize the progress bar
|
# Initialize the progress bar
|
||||||
pb = ProgressBar(len(entries), config, seconde=5)
|
pb = ProgressBar(len(entries), config, seconde=5)
|
||||||
|
|
||||||
@ -259,9 +271,14 @@ def run(config):
|
|||||||
# usm = uniqPrefixSequence
|
# usm = uniqPrefixSequence
|
||||||
# else:
|
# else:
|
||||||
usm = uniqSequence
|
usm = uniqSequence
|
||||||
|
|
||||||
usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
|
|
||||||
|
|
||||||
|
# if 'taxoURI' in config['obi'] : # TODO default None problem
|
||||||
|
# taxo = open_uri(config['obi']['taxoURI'])
|
||||||
|
# else :
|
||||||
|
taxo = None
|
||||||
|
|
||||||
|
usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
|
||||||
|
|
||||||
# if 'merge' in config['uniq'] :
|
# if 'merge' in config['uniq'] :
|
||||||
# merged_keys=set(config['uniq']['merge'])
|
# merged_keys=set(config['uniq']['merge'])
|
||||||
# else:
|
# else:
|
||||||
|
Reference in New Issue
Block a user