obi uniq: added option to use categories in addition to the sequence to

determine uniqueness
This commit is contained in:
Celine Mercier
2017-09-25 10:56:43 +02:00
parent 5ed6835e0e
commit 75c15594c4
2 changed files with 37 additions and 20 deletions

View File

@ -5,4 +5,4 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*) cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)

View File

@ -2,7 +2,6 @@
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.dms cimport DMS from obitools3.dms.dms cimport DMS
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.view cimport View, Line from obitools3.dms.view.view cimport View, Line
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line from obitools3.dms.column.column cimport Column, Column_line
@ -59,12 +58,12 @@ def addOptions(parser):
# TODO taxonomy # TODO taxonomy
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) : cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
cdef int i cdef int i
cdef int o_idx cdef int o_idx
cdef int u_idx cdef int u_idx
cdef int u_id cdef tuple u_id
cdef int i_count cdef int i_count
cdef set mergedKeys cdef set mergedKeys
cdef dict uniques cdef dict uniques
@ -80,23 +79,26 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxo
cdef Column seq_col cdef Column seq_col
cdef object to_merge cdef object to_merge
cdef Column_line mcol cdef Column_line mcol
cdef Column_line i_mcol cdef Column_line i_mcol
cdef list catl
#print(categories)
uniques = {} uniques = {}
if categories is None:
categories=[]
if mergedKeys_list is not None: if mergedKeys_list is not None:
mergedKeys=set(mergedKeys_list) mergedKeys=set(mergedKeys_list)
else: else:
mergedKeys=set() mergedKeys=set()
# if taxonomy is not None: if taxonomy is not None:
# mergedKeys.add('taxid') mergedKeys.add('taxid')
if categories is None:
categories = []
# Going through columns to merge a first time to create merged columns with the right number of elements per line and element names # Going through columns to merge a first time to create merged columns with the right number of elements per line and element names
#logger("info", "obi uniq", "First browsing through the input") logger("info", "First browsing through the input")
merged_infos = {} merged_infos = {}
i = 0 i = 0
iter_view = iter(view) iter_view = iter(view)
@ -131,20 +133,28 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxo
del(merged_infos) del(merged_infos)
#logger("info", "obi uniq", "Second browsing through the input") logger("info", "Second browsing through the input")
i = 0 i = 0
o_idx = 0 o_idx = 0
seq_col = view[NUC_SEQUENCE_COLUMN] seq_col = view[NUC_SEQUENCE_COLUMN]
iter_view = iter(view) iter_view = iter(view)
for i_seq in iter_view : for i_seq in iter_view :
pb(i) pb(i)
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),) # This can't be done in the same line as the u_id tuple creation because it generates a bug
u_id = seq_col.get_line_idx(i) # where Cython (version 0.25.2) does not detect the reference to the categs_list variable and deallocates
# it at the beginning of the function.
# (Only happens if categs_list is an optional parameter, which it is).
catl = []
for x in categories :
catl.append(i_seq[x])
u_id = tuple(catl) + (seq_col.get_line_idx(i),)
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),) # The line that cython can't read properly
if u_id in uniques: if u_id in uniques:
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None: if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
i_count = 1 i_count = 1
else: else:
@ -248,9 +258,11 @@ def run(config):
input=False, input=False,
newviewtype=View_NUC_SEQS) newviewtype=View_NUC_SEQS)
# TODO exceptions not handled like they should be
entries = input[1] entries = input[1]
o_view = output[1] o_view = output[1]
# Initialize the progress bar # Initialize the progress bar
pb = ProgressBar(len(entries), config, seconde=5) pb = ProgressBar(len(entries), config, seconde=5)
@ -259,9 +271,14 @@ def run(config):
# usm = uniqPrefixSequence # usm = uniqPrefixSequence
# else: # else:
usm = uniqSequence usm = uniqSequence
usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
# if 'taxoURI' in config['obi'] : # TODO default None problem
# taxo = open_uri(config['obi']['taxoURI'])
# else :
taxo = None
usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
# if 'merge' in config['uniq'] : # if 'merge' in config['uniq'] :
# merged_keys=set(config['uniq']['merge']) # merged_keys=set(config['uniq']['merge'])
# else: # else: