New obi uniq
python/obitools3/commands/uniq.pxd (new file, 8 lines)
@@ -0,0 +1,8 @@
#cython: language_level=3

from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS


cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=*, list mergedKeys_list=*, bint mergeIds=*, list categories=*)
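The `=*` markers in the declaration above are Cython's notation for "this argument has a default value, but the value itself lives in the implementation file": the .pxd only promises that the argument is optional, and the matching .pyx supplies the concrete default. A minimal sketch of the convention, with hypothetical names that are not part of this commit:

    # helper.pxd (hypothetical): declaration only, `=*` marks an optional argument
    cdef count_records(list records, bint verbose=*)

    # helper.pyx (hypothetical): the concrete default is supplied in the definition
    cdef count_records(list records, bint verbose=False):
        if verbose:
            print("counting", len(records), "records")
        return len(records)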
@@ -1,59 +1,29 @@
#cython: language_level=3

from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
from obitools3.dms.dms import DMS  # TODO cimport doesn't work
from obitools3.dms.view.view import View  # TODO cimport doesn't work
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms.column.column cimport Column
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT
from obitools3.utils cimport tostr
from obitools3.dms.dms cimport DMS
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.view cimport View, Line
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger


# TODO silence non-implemented options

__title__ = "Groups records together"

default_config = { 'inputview'  : None,
                   'outputview' : None
                 }


def addOptions(parser):

    # TODO put this common group somewhere else, but I don't know where
    group = parser.add_argument_group('DMS and view options')
    addSequenceInputOption(parser)
    addMinimalOutputOption(parser)

    group.add_argument('--default-dms', '-d',
                       action="store", dest="obi:defaultdms",
                       metavar='<DMS NAME>',
                       default=None,
                       type=str,
                       help="Name of the default DMS for reading and writing data.")

    group.add_argument('--input-view', '-i',
                       action="store", dest="obi:inputview",
                       metavar='<INPUT VIEW NAME>',
                       default=None,
                       type=str,
                       help="Name of the input view, either raw if the view is in the default DMS,"
                            " or in the form 'dms:view' if it is in another DMS.")

    group.add_argument('--output-view', '-o',
                       action="store", dest="obi:outputview",
                       metavar='<OUTPUT VIEW NAME>',
                       default=None,
                       type=str,
                       help="Name of the output view, either raw if the view is in the default DMS,"
                            " or in the form 'dms:view' if it is in another DMS.")

    group.add_argument('--taxo', '-t',
                       action="store", dest="obi:taxo",
                       metavar='<TAXONOMY NAME>',
                       default='',  # TODO not None, because if it is None the option is not entered in the option dictionary.
                       type=str,
                       help="Name of the taxonomy to use.")


    group = parser.add_argument_group('obi uniq specific options')
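The colon in the `dest` strings appears to namespace each option into the configuration dictionary that run() receives further down: `dest="obi:inputview"` is read back as config['obi']['inputview']. A minimal sketch of that mapping with a hypothetical helper (the real mapping is done by the obitools3 apps layer, not by this code):

    def nest_options(flat):
        # {"obi:inputview": "my_view"} -> {"obi": {"inputview": "my_view"}}
        nested = {}
        for key, value in flat.items():
            section, _, name = key.partition(':')
            nested.setdefault(section, {})[name] = value
        return nested

    config = nest_options({"obi:inputview": "my_view", "obi:outputview": "uniq_view"})
    assert config['obi']['inputview'] == "my_view"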
@@ -89,152 +59,198 @@ def addOptions(parser):
    # TODO taxonomy


# TODO
COUNT_COLUMN_str = tostr(COUNT_COLUMN)


def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None) :

    cdef int i
    cdef int o_idx
    cdef int u_idx
    cdef int u_id
    cdef int i_count
    cdef set mergedKeys
    cdef dict uniques
    cdef dict merged_infos
    cdef object iter_view
    cdef Line i_seq
    cdef Line o_seq
    cdef str key
    cdef bytes key_b
    cdef str mkey
    cdef str merged_col_name
    cdef Column i_col
    cdef Column seq_col
    cdef object to_merge
    cdef Column_line mcol
    cdef Column_line i_mcol

    uniques = {}

    if categories is None:
        categories = []

    if mergedKey is not None:
        mergedKey = set(mergedKey)
    if mergedKeys_list is not None:
        mergedKeys = set(mergedKeys_list)
    else:
        mergedKey = set()
        mergedKeys = set()

    if taxonomy is not None:
        mergedKey.add('taxid')
    # if taxonomy is not None:
    #     mergedKeys.add('taxid')

    # Go through the view's columns to merge, creating the merged columns first and then filling them as we go

    o_idx = 0
    # Going through the columns to merge a first time, to create the merged columns with the right number of elements per line and the element names
    #logger("info", "obi uniq", "First browsing through the input")
    merged_infos = {}
    i = 0
    seq_col = view[NUC_SEQUENCE_COLUMN]

    iter_view = iter(view)
    for i_seq in iter_view :
        pass
    for i_seq in iter_view:
        pb(i)
        for key in mergedKeys:
            mkey = "merged_%s" % key
            if key in i_seq:  # TODO what if mkey already in i_seq?
                if mkey not in merged_infos:
                    merged_infos[mkey] = {}
                    mkey_infos = merged_infos[mkey]
                    mkey_infos['nb_elts'] = 1
                    mkey_infos['elt_names'] = [i_seq[key]]
                else:
                    mkey_infos = merged_infos[mkey]
                    if i_seq[key] not in mkey_infos['elt_names']:  # TODO make faster? but how?
                        mkey_infos['elt_names'].append(i_seq[key])
                        mkey_infos['nb_elts'] += 1
        i += 1

    for key in mergedKeys:
        merged_col_name = "merged_%s" % key
        i_col = view[key]
        Column.new_column(o_view,
                          merged_col_name,
                          OBI_INT,
                          nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
                          elements_names=merged_infos[merged_col_name]['elt_names'],
                          comments=i_col.comments,
                          alias=merged_col_name  # TODO what if it already exists
                          )

    del(merged_infos)
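The point of this first pass is to size the merged columns before any data is written: for every key to merge, merged_infos records how many distinct values occur in the input and what they are, so that Column.new_column can allocate one element per distinct value on each line. A toy illustration of the structure being built, with a hypothetical 'sample' key that is not part of this commit:

    # after scanning three records whose 'sample' values are 'A', 'B', 'A'
    merged_infos = {
        'merged_sample': {
            'nb_elts': 2,               # number of distinct values seen
            'elt_names': ['A', 'B'],    # one column element per distinct value
        }
    }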

    #logger("info", "obi uniq", "Second browsing through the input")
    i = 0
    o_idx = 0
    seq_col = view[NUC_SEQUENCE_COLUMN]

    iter_view = iter(view)
    for i_seq in iter_view :
        pb(i)
        # use the AVL index here; the API still needs to be written
        #u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)

        #u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
        u_id = seq_col.get_line_idx(i)

        if u_id in uniques:
            u_seq = uniques[u_id]
            o_seq = o_view[u_seq['idx']]
            if COUNT_COLUMN_str in i_seq:
                o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]

            if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
                i_count = 1
            else:
                o_seq[COUNT_COLUMN_str] += 1
                # seq['COUNT']=1
                i_count = i_seq[COUNT_COLUMN]

            u_idx = uniques[u_id]
            o_seq = o_view[u_idx]
            o_seq[COUNT_COLUMN] += i_count

            # if taxonomy is not None and 'taxid' in seq:
            #     s['merged_taxid'][seq['taxid']]=
            for key in mergedKey:
                if key=='taxid' and mergeIds: # TODO
                    if 'taxid_dist' in i_seq:
                        u_seq["taxid_dist"].update(i_seq["taxid_dist"])
                    if 'taxid' in i_seq:
                        u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']

                mkey = "merged_%s" % key
            for key in mergedKeys:
                # if key=='taxid' and mergeIds: # TODO
                #     if 'taxid_dist' in i_seq:
                #         u_seq["taxid_dist"].update(i_seq["taxid_dist"])
                #     if 'taxid' in i_seq:
                #         u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']

                mkey = "merged_%s" % key
                # case where the merged keys are being updated but the incoming sequence has no merged keys yet
                if key in i_seq:
                    u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
                    to_merge = i_seq[key]
                    mcol = o_seq[mkey]
                    if mcol[to_merge] is None:
                        mcol[to_merge] = i_count
                    else:
                        mcol[to_merge] = mcol[to_merge] + i_count
                # case where the merged keys already exist
                else:
                else:  # TODO is this a good else
                    if mkey in i_seq:
                        for skey in i_seq[mkey]:
                            u_seq[mkey][skey] = u_seq[mkey].get(skey, 0) + i_seq[mkey][skey]

            for key in i_seq.keys():
                        mcol = o_seq[mkey]
                        i_mcol = i_seq[mkey]
                        for key_b in i_mcol:
                            if mcol[key_b] is None:
                                mcol[key_b] = i_mcol[key_b]
                            else:
                                mcol[key_b] = mcol[key_b] + i_mcol[key_b]

            for key_b in i_seq.keys():
                # Properly merge the 'merged' attribute if it exists
                if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str :  #and key[0:7]!='merged_' and key!='merged': TODO check this
                    o_seq[key] = None

            if mergeIds:
                u_seq['merged'].append(i_seq.id)

        else:
                # TODO make special columns list?
                if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b] :
                    o_seq[key_b] = None

            # if mergeIds: # TODO
            #     u_seq['merged'].append(i_seq.id)

        else:
            o_view[o_idx] = i_seq
            o_seq = o_view[o_idx]
            uniques[u_id] = {'idx':o_idx}
            u_seq = uniques[u_id]
            uniques[u_id] = o_idx
            o_idx += 1

            if COUNT_COLUMN_str not in o_seq:
                o_seq[COUNT_COLUMN_str] = 1

            for key in mergedKey:
                if key=='taxid' and mergeIds:
                    if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
                        u_seq["taxid_dist"] = {}
                    else :
                        u_seq["taxid_dist"] = o_seq["taxid_dist"]
                    if 'taxid' in o_seq:
                        u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
                mkey = "merged_%s" % key
                if mkey not in o_seq:
                    u_seq[mkey] = {}
                else :
                    u_seq[mkey] = o_seq[mkey]
            if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
                o_seq[COUNT_COLUMN] = 1

            for key in mergedKeys:
                # if key=='taxid' and mergeIds:
                #     if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
                #         u_seq["taxid_dist"] = {}
                #     else :
                #         u_seq["taxid_dist"] = o_seq["taxid_dist"]
                #     if 'taxid' in o_seq:
                #         u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
                mkey = "merged_%s" % key
                if key in o_seq:
                    u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key], 0) + o_seq[COUNT_COLUMN_str]
                    o_seq[key] = None

                    if mergeIds:
                        u_seq['merged'] = [o_seq.id]

                    to_merge = o_seq[key]
                    mcol = o_seq[mkey]
                    if to_merge in mcol and mcol[to_merge] is not None:
                        mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
                    else:
                        mcol[to_merge] = o_seq[COUNT_COLUMN]
                    o_seq[key] = None  # TODO delete column eventually -> make C function?

            # if mergeIds:
            #     u_seq['merged'] = [o_seq.id]
        i += 1


    #TODO
    #if taxonomy is not None:
    #    mergeTaxonomyClassification(uniqSeq, taxonomy)

    # Get the information needed to build the columns with merged attributes
    merged_infos = {}
    for u_id in uniques :
        u_seq = uniques[u_id]
        for mkey in u_seq :
            if mkey != 'idx' :
                mkey_dict = u_seq[mkey]
                if mkey not in merged_infos :
                    merged_infos[mkey] = {}
                    mkey_infos = merged_infos[mkey]
                    mkey_infos['nb_elts'] = len(mkey_dict.keys())
                    mkey_infos['elt_names'] = [k for k in mkey_dict]
                else :
                    mkey_infos = merged_infos[mkey]
                    for k in mkey_dict :
                        if k not in mkey_infos['elt_names'] :
                            mkey_infos['elt_names'].append(k)
                            mkey_infos['nb_elts'] += 1

    keys_to_del = []
    for k in merged_infos :
        if merged_infos[k]['nb_elts'] == 0:
            keys_to_del.append(k)
    for k in keys_to_del :
        del merged_infos[k]

    return (uniques, merged_infos)
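Stripped of the obitools3 view and column machinery, the function implements plain dereplication: identical sequences collapse to one output record, counts are summed, and for every merged key a per-value tally is kept. A pure-Python sketch of that core idea, keyed directly by the sequence string instead of the column's line index (illustrative only, not the committed code):

    def dedup(records, merged_keys=('sample',)):
        # records: iterable of dicts with at least 'id', 'seq' and optionally 'count'
        uniques = {}
        for rec in records:
            count = rec.get('count', 1)
            u = uniques.setdefault(rec['seq'], {'count': 0, 'merged_ids': []})
            u['count'] += count
            u['merged_ids'].append(rec['id'])
            for key in merged_keys:
                tally = u.setdefault('merged_%s' % key, {})
                if key in rec:
                    tally[rec[key]] = tally.get(rec[key], 0) + count
        return uniques

    recs = [{'id': 'a', 'seq': 'ACGT', 'sample': 'S1'},
            {'id': 'b', 'seq': 'ACGT', 'sample': 'S2'},
            {'id': 'c', 'seq': 'TTTT', 'sample': 'S1', 'count': 3}]
    result = dedup(recs)
    assert result['ACGT']['count'] == 2
    assert result['ACGT']['merged_sample'] == {'S1': 1, 'S2': 1}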


def run(config):

    # TODO declare variables
    cdef tuple input
    cdef tuple output
    cdef View_NUC_SEQS entries
    cdef View_NUC_SEQS o_view
    cdef ProgressBar pb

    logger("info", "obi uniq")

    # Open DMS
    d = DMS.open(config['obi']['defaultdms'])

    # Open input view
    entries = View.open(d, config['obi']['inputview'])

    input = open_uri(config['obi']['inputURI'])

    if input[2] != View_NUC_SEQS:
        raise NotImplementedError('obi uniq only works on NUC_SEQS views')

    output = open_uri(config['obi']['outputURI'],
                      input=False,
                      newviewtype=View_NUC_SEQS)

    entries = input[1]
    o_view = output[1]

    # Initialize the progress bar
    pb = ProgressBar(len(entries), config, seconde=5)
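Judging from how the open_uri tuples are used in this diff (input[1] is iterated as the view, input[2] is compared against View_NUC_SEQS, and input[0] is closed at the end of run()), the return value is unpacked roughly as sketched below. This is a hypothetical illustration of that unpacking, not the actual open_uri API:

    result = open_uri("some_input_uri")   # hypothetical URI string
    handle  = result[0]   # whatever must be closed when done (closed at the end of run())
    entries = result[1]   # the view to iterate over
    vclass  = result[2]   # the view class, compared against View_NUC_SEQS above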

@@ -243,55 +259,20 @@ def run(config):
    # usm = uniqPrefixSequence
    # else:
    usm = uniqSequence

    # Create output view
    view_class = View.get_view_class(entries.type)
    if view_class == View_NUC_SEQS :
        get_quality = tostr(QUALITY_COLUMN) in entries  # TODO
        o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
    else :
        o_view = view_class.new(d, config['obi']['outputview'])

    (uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
    usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])

    if 'merge' in config['uniq'] :
        merged_keys = set(config['uniq']['merge'])
    else:
        merged_keys = set()

    if 'taxo' in config['obi'] :
        merged_keys.add('taxid')

    # TODO gotta handle special merged columns
    for k in merged_keys:
        merged_col_name = "merged_%s" % k
        if merged_col_name in merged_infos :
            i_col = entries[k]
            Column.new_column(o_view,
                              merged_col_name,
                              OBI_INT,
                              nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
                              elements_names=merged_infos[merged_col_name]['elt_names'],
                              comments=i_col.comments,
                              alias=merged_col_name
                              )

    for u_id in uniques:
        u_dict = uniques[u_id]
        for merged_k in u_dict :
            if merged_k in merged_infos :  # TODO don't enter irrelevant keys to begin with, instead
                o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
    # if 'merge' in config['uniq'] :
    #     merged_keys=set(config['uniq']['merge'])
    # else:
    #     merged_keys=set()
    #
    # if 'taxo' in config['obi'] :
    #     merged_keys.add('taxid')

    print("\n")
    print(repr(o_view))

    d.close()

    input[0].close()
    output[0].close()