New obi uniq

This commit is contained in:
Celine Mercier
2017-08-20 18:04:21 +02:00
parent 6a2759eee6
commit 73bca6288f
2 changed files with 184 additions and 195 deletions

View File

@ -0,0 +1,8 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)

View File

@ -1,59 +1,29 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.dms import DMS # TODO cimport doesn't work
from obitools3.dms.view.view import View # TODO cimport doesn't work
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms.column.column cimport Column
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT
from obitools3.utils cimport tostr
from obitools3.dms.dms cimport DMS
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.view cimport View, Line
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
# TODO silence non-implemented options
__title__="Groups records together"
default_config = { 'inputview' : None,
'outputview' : None
}
def addOptions(parser):
# TODO put this common group somewhere else but I don't know where
group=parser.add_argument_group('DMS and view options')
addSequenceInputOption(parser)
addMinimalOutputOption(parser)
group.add_argument('--default-dms','-d',
action="store", dest="obi:defaultdms",
metavar='<DMS NAME>',
default=None,
type=str,
help="Name of the default DMS for reading and writing data.")
group.add_argument('--input-view','-i',
action="store", dest="obi:inputview",
metavar='<INPUT VIEW NAME>',
default=None,
type=str,
help="Name of the input view, either raw if the view is in the default DMS,"
" or in the form 'dms:view' if it is in another DMS.")
group.add_argument('--output-view','-o',
action="store", dest="obi:outputview",
metavar='<OUTPUT VIEW NAME>',
default=None,
type=str,
help="Name of the output view, either raw if the view is in the default DMS,"
" or in the form 'dms:view' if it is in another DMS.")
group.add_argument('--taxo','-t',
action="store", dest="obi:taxo",
metavar='<TAXONOMY NAME>',
default='', # TODO not None because if it's None, the option is not entered in the option dictionary.
type=str,
help="Name of the taxonomy to use.")
group = parser.add_argument_group('obi uniq specific options')
@ -89,152 +59,198 @@ def addOptions(parser):
# TODO taxonomy
# TODO
COUNT_COLUMN_str = tostr(COUNT_COLUMN)
def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :
cdef int i
cdef int o_idx
cdef int u_idx
cdef int u_id
cdef int i_count
cdef set mergedKeys
cdef dict uniques
cdef dict merged_infos
cdef object iter_view
cdef Line i_seq
cdef Line o_seq
cdef str key
cdef bytes key_b
cdef str mkey
cdef str merged_col_name
cdef Column i_col
cdef Column seq_col
cdef object to_merge
cdef Column_line mcol
cdef Column_line i_mcol
uniques = {}
if categories is None:
categories=[]
if mergedKey is not None:
mergedKey=set(mergedKey)
if mergedKeys_list is not None:
mergedKeys=set(mergedKeys_list)
else:
mergedKey=set()
mergedKeys=set()
if taxonomy is not None:
mergedKey.add('taxid')
# if taxonomy is not None:
# mergedKeys.add('taxid')
# Faire parcours de la view des colonnes à merged pour créer les merged_col avant et les remplir au fur et à mesure
o_idx = 0
# Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names
#logger("info", "obi uniq", "First browsing through the input")
merged_infos = {}
i = 0
seq_col = view[NUC_SEQUENCE_COLUMN]
iter_view = iter(view)
for i_seq in iter_view :
pass
for i_seq in iter_view:
pb(i)
for key in mergedKeys:
mkey = "merged_%s" % key
if key in i_seq: # TODO what if mkey already in i_seq?
if mkey not in merged_infos:
merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey]
mkey_infos['nb_elts'] = 1
mkey_infos['elt_names'] = [i_seq[key]]
else:
mkey_infos = merged_infos[mkey]
if i_seq[key] not in mkey_infos['elt_names']: # TODO make faster? but how?
mkey_infos['elt_names'].append(i_seq[key])
mkey_infos['nb_elts'] += 1
i+=1
for key in mergedKeys:
merged_col_name = "merged_%s" % key
i_col = view[key]
Column.new_column(o_view,
merged_col_name,
OBI_INT,
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
elements_names=merged_infos[merged_col_name]['elt_names'],
comments=i_col.comments,
alias=merged_col_name # TODO what if it already exists
)
del(merged_infos)
#logger("info", "obi uniq", "Second browsing through the input")
i = 0
o_idx = 0
seq_col = view[NUC_SEQUENCE_COLUMN]
iter_view = iter(view)
for i_seq in iter_view :
pb(i)
# utiliser l'index des AVLs, faire l'API
#u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
u_id = seq_col.get_line_idx(i)
if u_id in uniques:
u_seq = uniques[u_id]
o_seq = o_view[u_seq['idx']]
if COUNT_COLUMN_str in i_seq:
o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
i_count = 1
else:
o_seq[COUNT_COLUMN_str] += 1
# seq['COUNT']=1
i_count = i_seq[COUNT_COLUMN]
u_idx = uniques[u_id]
o_seq = o_view[u_idx]
o_seq[COUNT_COLUMN] += i_count
# if taxonomy is not None and 'taxid' in seq:
# s['merged_taxid'][seq['taxid']]=
for key in mergedKey:
if key=='taxid' and mergeIds: # TODO
if 'taxid_dist' in i_seq:
u_seq["taxid_dist"].update(i_seq["taxid_dist"])
if 'taxid' in i_seq:
u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
mkey = "merged_%s" % key
for key in mergedKeys:
# if key=='taxid' and mergeIds: # TODO
# if 'taxid_dist' in i_seq:
# u_seq["taxid_dist"].update(i_seq["taxid_dist"])
# if 'taxid' in i_seq:
# u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
mkey = "merged_%s" % key
#cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
if key in i_seq:
u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
to_merge = i_seq[key]
mcol = o_seq[mkey]
if mcol[to_merge] is None:
mcol[to_merge] = i_count
else:
mcol[to_merge] = mcol[to_merge] + i_count
#cas ou merged_keys existe deja
else:
else: # TODO is this a good else
if mkey in i_seq:
for skey in i_seq[mkey]:
u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
for key in i_seq.keys():
mcol = o_seq[mkey]
i_mcol = i_seq[mkey]
for key_b in i_mcol:
if mcol[key_b] is None:
mcol[key_b] = i_mcol[key_b]
else:
mcol[key_b] = mcol[key_b] + i_mcol[key_b]
for key_b in i_seq.keys():
# Merger proprement l'attribut merged s'il existe
if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str : #and key[0:7]!='merged_' and key!='merged': TODO check this
o_seq[key] = None
if mergeIds:
u_seq['merged'].append(i_seq.id)
else:
# TODO make special columns list?
if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
o_seq[key_b] = None
# if mergeIds: # TODO
# u_seq['merged'].append(i_seq.id)
else:
o_view[o_idx] = i_seq
o_seq = o_view[o_idx]
uniques[u_id] = {'idx':o_idx}
u_seq = uniques[u_id]
uniques[u_id] = o_idx
o_idx += 1
if COUNT_COLUMN_str not in o_seq:
o_seq[COUNT_COLUMN_str] = 1
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
u_seq["taxid_dist"] = {}
else :
u_seq["taxid_dist"] = o_seq["taxid_dist"]
if 'taxid' in o_seq:
u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
mkey = "merged_%s" % key
if mkey not in o_seq:
u_seq[mkey]={}
else :
u_seq[mkey] = o_seq[mkey]
if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
o_seq[COUNT_COLUMN] = 1
for key in mergedKeys:
# if key=='taxid' and mergeIds:
# if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
# u_seq["taxid_dist"] = {}
# else :
# u_seq["taxid_dist"] = o_seq["taxid_dist"]
# if 'taxid' in o_seq:
# u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
mkey = "merged_%s" % key
if key in o_seq:
u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
o_seq[key] = None
if mergeIds:
u_seq['merged']=[o_seq.id]
to_merge = o_seq[key]
mcol = o_seq[mkey]
if to_merge in mcol and mcol[to_merge] is not None:
mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
else:
mcol[to_merge] = o_seq[COUNT_COLUMN]
o_seq[key] = None # TODO delete column eventually -> make C function?
# if mergeIds:
# u_seq['merged']=[o_seq.id]
i+=1
#TODO
#if taxonomy is not None:
# mergeTaxonomyClassification(uniqSeq, taxonomy)
# Get informations to build the columns with merged attributes
merged_infos = {}
for u_id in uniques :
u_seq = uniques[u_id]
for mkey in u_seq :
if mkey != 'idx' :
mkey_dict = u_seq[mkey]
if mkey not in merged_infos :
merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey]
mkey_infos['nb_elts'] = len(mkey_dict.keys())
mkey_infos['elt_names'] = [k for k in mkey_dict]
else :
mkey_infos = merged_infos[mkey]
for k in mkey_dict :
if k not in mkey_infos['elt_names'] :
mkey_infos['elt_names'].append(k)
mkey_infos['nb_elts'] += 1
keys_to_del = []
for k in merged_infos :
if merged_infos[k]['nb_elts'] == 0:
keys_to_del.append(k)
for k in keys_to_del :
del merged_infos[k]
return (uniques, merged_infos)
def run(config):
# TODO declare variables
cdef tuple input
cdef tuple output
cdef View_NUC_SEQS entries
cdef View_NUC_SEQS o_view
cdef ProgressBar pb
logger("info","obi uniq")
# Open DMS
d = DMS.open(config['obi']['defaultdms'])
# Open input view
entries = View.open(d, config['obi']['inputview'])
input = open_uri(config['obi']['inputURI'])
if input[2] != View_NUC_SEQS:
raise NotImplementedError('obi uniq only works on NUC_SEQS views')
output = open_uri(config['obi']['outputURI'],
input=False,
newviewtype=View_NUC_SEQS)
entries = input[1]
o_view = output[1]
# Initialize the progress bar
pb = ProgressBar(len(entries), config, seconde=5)
@ -243,55 +259,20 @@ def run(config):
# usm = uniqPrefixSequence
# else:
usm = uniqSequence
# Create output view
view_class = View.get_view_class(entries.type)
if view_class == View_NUC_SEQS :
get_quality = tostr(QUALITY_COLUMN) in entries # TODO
o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
else :
o_view = view_class.new(d, config['obi']['outputview'])
(uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
if 'merge' in config['uniq'] :
merged_keys=set(config['uniq']['merge'])
else:
merged_keys=set()
if 'taxo' in config['obi'] :
merged_keys.add('taxid')
# TODO gotta handle special merged columns
for k in merged_keys:
merged_col_name = "merged_%s" % k
if merged_col_name in merged_infos :
i_col = entries[k]
Column.new_column(o_view,
merged_col_name,
OBI_INT,
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
elements_names=merged_infos[merged_col_name]['elt_names'],
comments=i_col.comments,
alias=merged_col_name
)
for u_id in uniques:
u_dict = uniques[u_id]
for merged_k in u_dict :
if merged_k in merged_infos : # TODO don't enter irrelevant keys to begin with, instead
o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
# if 'merge' in config['uniq'] :
# merged_keys=set(config['uniq']['merge'])
# else:
# merged_keys=set()
#
# if 'taxo' in config['obi'] :
# merged_keys.add('taxid')
print("\n")
print(repr(o_view))
d.close()
input[0].close()
output[0].close()