obi uniq: added option to use categories in addition to the sequence to
determine uniqueness
This commit is contained in:
@ -5,4 +5,4 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
|
||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||
|
||||
|
||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)
|
||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
|
||||
|
@ -2,7 +2,6 @@
|
||||
|
||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||
from obitools3.dms.dms cimport DMS
|
||||
from obitools3.dms.taxo.taxo cimport Taxonomy
|
||||
from obitools3.dms.view.view cimport View, Line
|
||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||
from obitools3.dms.column.column cimport Column, Column_line
|
||||
@ -59,12 +58,12 @@ def addOptions(parser):
|
||||
# TODO taxonomy
|
||||
|
||||
|
||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
|
||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
|
||||
|
||||
cdef int i
|
||||
cdef int o_idx
|
||||
cdef int u_idx
|
||||
cdef int u_id
|
||||
cdef tuple u_id
|
||||
cdef int i_count
|
||||
cdef set mergedKeys
|
||||
cdef dict uniques
|
||||
@ -80,23 +79,26 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxo
|
||||
cdef Column seq_col
|
||||
cdef object to_merge
|
||||
cdef Column_line mcol
|
||||
cdef Column_line i_mcol
|
||||
|
||||
cdef Column_line i_mcol
|
||||
cdef list catl
|
||||
|
||||
#print(categories)
|
||||
|
||||
uniques = {}
|
||||
|
||||
if categories is None:
|
||||
categories=[]
|
||||
|
||||
if mergedKeys_list is not None:
|
||||
mergedKeys=set(mergedKeys_list)
|
||||
else:
|
||||
mergedKeys=set()
|
||||
|
||||
# if taxonomy is not None:
|
||||
# mergedKeys.add('taxid')
|
||||
if taxonomy is not None:
|
||||
mergedKeys.add('taxid')
|
||||
|
||||
if categories is None:
|
||||
categories = []
|
||||
|
||||
# Going through the columns to merge a first time, to create the merged columns with the right number of elements per line and the right element names
|
||||
#logger("info", "obi uniq", "First browsing through the input")
|
||||
logger("info", "First browsing through the input")
|
||||
merged_infos = {}
|
||||
i = 0
|
||||
iter_view = iter(view)
|
||||
@ -131,20 +133,28 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxo
|
||||
|
||||
del(merged_infos)
|
||||
|
||||
#logger("info", "obi uniq", "Second browsing through the input")
|
||||
logger("info", "Second browsing through the input")
|
||||
i = 0
|
||||
o_idx = 0
|
||||
seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||
|
||||
|
||||
iter_view = iter(view)
|
||||
for i_seq in iter_view :
|
||||
pb(i)
|
||||
|
||||
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
|
||||
u_id = seq_col.get_line_idx(i)
|
||||
# This can't be done in the same line as the u_id tuple creation because it triggers a bug
|
||||
# where Cython (version 0.25.2) does not detect the reference to the categories variable and deallocates
|
||||
# it at the beginning of the function.
|
||||
# (Only happens if categories is an optional parameter, which it is.)
|
||||
catl = []
|
||||
for x in categories :
|
||||
catl.append(i_seq[x])
|
||||
|
||||
u_id = tuple(catl) + (seq_col.get_line_idx(i),)
|
||||
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),) # The line that cython can't read properly
|
||||
|
||||
if u_id in uniques:
|
||||
|
||||
|
||||
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
|
||||
i_count = 1
|
||||
else:
|
||||
@ -248,9 +258,11 @@ def run(config):
|
||||
input=False,
|
||||
newviewtype=View_NUC_SEQS)
|
||||
|
||||
# TODO exceptions not handled like they should be
|
||||
|
||||
entries = input[1]
|
||||
o_view = output[1]
|
||||
|
||||
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(len(entries), config, seconde=5)
|
||||
|
||||
@ -259,9 +271,14 @@ def run(config):
|
||||
# usm = uniqPrefixSequence
|
||||
# else:
|
||||
usm = uniqSequence
|
||||
|
||||
usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
|
||||
|
||||
# if 'taxoURI' in config['obi'] : # TODO default None problem
|
||||
# taxo = open_uri(config['obi']['taxoURI'])
|
||||
# else :
|
||||
taxo = None
|
||||
|
||||
usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
|
||||
|
||||
# if 'merge' in config['uniq'] :
|
||||
# merged_keys=set(config['uniq']['merge'])
|
||||
# else:
|
||||
|
Reference in New Issue
Block a user