obi uniq: first version
This commit is contained in:
297
python/obitools3/commands/uniq.pyx
Normal file
297
python/obitools3/commands/uniq.pyx
Normal file
@ -0,0 +1,297 @@
|
||||
#cython: language_level=3
|
||||
|
||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||
from obitools3.dms.dms import DMS # TODO cimport doesn't work
|
||||
from obitools3.dms.view.view import View # TODO cimport doesn't work
|
||||
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS
|
||||
from obitools3.dms.obiseq cimport Nuc_Seq
|
||||
from obitools3.dms.column.column cimport Column
|
||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN
|
||||
from obitools3.dms.capi.obitypes cimport OBI_INT
|
||||
from obitools3.utils cimport tostr
|
||||
|
||||
|
||||
# TODO silence non-implemented options
|
||||
|
||||
__title__="Groups records together"
|
||||
|
||||
default_config = { 'inputview' : None,
|
||||
'outputview' : None
|
||||
}
|
||||
|
||||
def addOptions(parser):
|
||||
|
||||
# TODO put this common group somewhere else but I don't know where
|
||||
group=parser.add_argument_group('DMS and view options')
|
||||
|
||||
group.add_argument('--default-dms','-d',
|
||||
action="store", dest="obi:defaultdms",
|
||||
metavar='<DMS NAME>',
|
||||
default=None,
|
||||
type=str,
|
||||
help="Name of the default DMS for reading and writing data.")
|
||||
|
||||
group.add_argument('--input-view','-i',
|
||||
action="store", dest="obi:inputview",
|
||||
metavar='<INPUT VIEW NAME>',
|
||||
default=None,
|
||||
type=str,
|
||||
help="Name of the input view, either raw if the view is in the default DMS,"
|
||||
" or in the form 'dms:view' if it is in another DMS.")
|
||||
|
||||
group.add_argument('--output-view','-o',
|
||||
action="store", dest="obi:outputview",
|
||||
metavar='<OUTPUT VIEW NAME>',
|
||||
default=None,
|
||||
type=str,
|
||||
help="Name of the output view, either raw if the view is in the default DMS,"
|
||||
" or in the form 'dms:view' if it is in another DMS.")
|
||||
|
||||
group.add_argument('--taxo','-t',
|
||||
action="store", dest="obi:taxo",
|
||||
metavar='<TAXONOMY NAME>',
|
||||
default='', # TODO not None because if it's None, the option is not entered in the option dictionary.
|
||||
type=str,
|
||||
help="Name of the taxonomy to use.")
|
||||
|
||||
|
||||
group = parser.add_argument_group('obi uniq specific options')
|
||||
|
||||
group.add_argument('--merge', '-m',
|
||||
action="append", dest="uniq:merge",
|
||||
metavar="<TAG NAME>",
|
||||
default=[],
|
||||
type=str,
|
||||
help="Attributes to merge.") # TODO must be a 1 elt/line column
|
||||
|
||||
group.add_argument('--merge-ids', '-e',
|
||||
action="store_true", dest="uniq:mergeids",
|
||||
default=False,
|
||||
help="Add the merged key with all ids of merged sequences.")
|
||||
|
||||
group.add_argument('--category-attribute', '-c',
|
||||
action="append", dest="uniq:categories",
|
||||
metavar="<Attribute Name>",
|
||||
default=[],
|
||||
help="Add one attribute to the list of attributes "
|
||||
"used to group sequences before dereplication "
|
||||
"(option can be used several times).")
|
||||
|
||||
group.add_argument('--prefix', '-p',
|
||||
action="store_true", dest="uniq:prefix",
|
||||
default=False,
|
||||
help="Dereplication is done based on prefix matching: "
|
||||
"(i) The shortest sequence of each group is a prefix "
|
||||
"of any sequence of its group (ii) Two shortest "
|
||||
"sequences of any couple of groups are not the"
|
||||
"prefix of the other one.")
|
||||
|
||||
# TODO taxonomy
|
||||
|
||||
|
||||
# TODO
|
||||
COUNT_COLUMN_str = tostr(COUNT_COLUMN)
|
||||
|
||||
|
||||
def uniqSequence(view, pb, o_view, taxonomy=None, mergedKey=None, mergeIds=False, categories=None) :
|
||||
|
||||
uniques = {}
|
||||
|
||||
if categories is None:
|
||||
categories=[]
|
||||
|
||||
if mergedKey is not None:
|
||||
mergedKey=set(mergedKey)
|
||||
else:
|
||||
mergedKey=set()
|
||||
|
||||
if taxonomy is not None:
|
||||
mergedKey.add('taxid')
|
||||
|
||||
# Faire parcours de la view des colonnes à merged pour créer les merged_col avant et les remplir au fur et à mesure
|
||||
|
||||
o_idx = 0
|
||||
i = 0
|
||||
seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||
|
||||
iter_view = iter(view)
|
||||
for i_seq in iter_view :
|
||||
pass
|
||||
|
||||
iter_view = iter(view)
|
||||
for i_seq in iter_view :
|
||||
pb(i)
|
||||
# utiliser l'index des AVLs, faire l'API
|
||||
#u_id = tuple(i_seq[x] for x in categories) + (str(i_seq),)
|
||||
u_id = seq_col.get_line_idx(i)
|
||||
if u_id in uniques:
|
||||
u_seq = uniques[u_id]
|
||||
o_seq = o_view[u_seq['idx']]
|
||||
if COUNT_COLUMN_str in i_seq:
|
||||
o_seq[COUNT_COLUMN_str] += i_seq[COUNT_COLUMN_str]
|
||||
else:
|
||||
o_seq[COUNT_COLUMN_str] += 1
|
||||
# seq['COUNT']=1
|
||||
# if taxonomy is not None and 'taxid' in seq:
|
||||
# s['merged_taxid'][seq['taxid']]=
|
||||
for key in mergedKey:
|
||||
if key=='taxid' and mergeIds: # TODO
|
||||
if 'taxid_dist' in i_seq:
|
||||
u_seq["taxid_dist"].update(i_seq["taxid_dist"])
|
||||
if 'taxid' in i_seq:
|
||||
u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
|
||||
|
||||
mkey = "merged_%s" % key
|
||||
#cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
|
||||
if key in i_seq:
|
||||
u_seq[mkey][i_seq[key]] = u_seq[mkey].get(i_seq[key], 0) + i_seq[COUNT_COLUMN_str]
|
||||
#cas ou merged_keys existe deja
|
||||
else:
|
||||
if mkey in i_seq:
|
||||
for skey in i_seq[mkey]:
|
||||
u_seq[mkey][skey] = u_seq[mkey].get(skey,0) + i_seq[mkey][skey]
|
||||
|
||||
for key in i_seq.keys():
|
||||
# Merger proprement l'attribut merged s'il existe
|
||||
if key in o_seq and o_seq[key] != i_seq[key] and tostr(key) != COUNT_COLUMN_str : #and key[0:7]!='merged_' and key!='merged': TODO check this
|
||||
o_seq[key] = None
|
||||
|
||||
if mergeIds:
|
||||
u_seq['merged'].append(i_seq.id)
|
||||
|
||||
else:
|
||||
|
||||
o_view[o_idx] = i_seq
|
||||
o_seq = o_view[o_idx]
|
||||
uniques[u_id] = {'idx':o_idx}
|
||||
u_seq = uniques[u_id]
|
||||
o_idx += 1
|
||||
|
||||
if COUNT_COLUMN_str not in o_seq:
|
||||
o_seq[COUNT_COLUMN_str] = 1
|
||||
|
||||
for key in mergedKey:
|
||||
if key=='taxid' and mergeIds:
|
||||
if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
|
||||
u_seq["taxid_dist"] = {}
|
||||
else :
|
||||
u_seq["taxid_dist"] = o_seq["taxid_dist"]
|
||||
if 'taxid' in o_seq:
|
||||
u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
|
||||
mkey = "merged_%s" % key
|
||||
if mkey not in o_seq:
|
||||
u_seq[mkey]={}
|
||||
else :
|
||||
u_seq[mkey] = o_seq[mkey]
|
||||
if key in o_seq:
|
||||
u_seq[mkey][o_seq[key]] = u_seq[mkey].get(o_seq[key],0) + o_seq[COUNT_COLUMN_str]
|
||||
o_seq[key] = None
|
||||
|
||||
if mergeIds:
|
||||
u_seq['merged']=[o_seq.id]
|
||||
|
||||
i+=1
|
||||
|
||||
#TODO
|
||||
#if taxonomy is not None:
|
||||
# mergeTaxonomyClassification(uniqSeq, taxonomy)
|
||||
|
||||
# Get informations to build the columns with merged attributes
|
||||
merged_infos = {}
|
||||
for u_id in uniques :
|
||||
u_seq = uniques[u_id]
|
||||
for mkey in u_seq :
|
||||
if mkey != 'idx' :
|
||||
mkey_dict = u_seq[mkey]
|
||||
if mkey not in merged_infos :
|
||||
merged_infos[mkey] = {}
|
||||
mkey_infos = merged_infos[mkey]
|
||||
mkey_infos['nb_elts'] = len(mkey_dict.keys())
|
||||
mkey_infos['elt_names'] = [k for k in mkey_dict]
|
||||
else :
|
||||
mkey_infos = merged_infos[mkey]
|
||||
for k in mkey_dict :
|
||||
if k not in mkey_infos['elt_names'] :
|
||||
mkey_infos['elt_names'].append(k)
|
||||
mkey_infos['nb_elts'] += 1
|
||||
|
||||
keys_to_del = []
|
||||
for k in merged_infos :
|
||||
if merged_infos[k]['nb_elts'] == 0:
|
||||
keys_to_del.append(k)
|
||||
for k in keys_to_del :
|
||||
del merged_infos[k]
|
||||
|
||||
return (uniques, merged_infos)
|
||||
|
||||
|
||||
def run(config):
|
||||
|
||||
# TODO declare variables
|
||||
|
||||
# Open DMS
|
||||
d = DMS.open(config['obi']['defaultdms'])
|
||||
|
||||
# Open input view
|
||||
entries = View.open(d, config['obi']['inputview'])
|
||||
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(len(entries), config, seconde=5)
|
||||
|
||||
# TODO
|
||||
# if options.prefix:
|
||||
# usm = uniqPrefixSequence
|
||||
# else:
|
||||
usm = uniqSequence
|
||||
|
||||
# Create output view
|
||||
view_class = View.get_view_class(entries.type)
|
||||
if view_class == View_NUC_SEQS :
|
||||
get_quality = tostr(QUALITY_COLUMN) in entries # TODO
|
||||
o_view = View_NUC_SEQS.new(d, config['obi']['outputview'], quality=get_quality)
|
||||
else :
|
||||
o_view = view_class.new(d, config['obi']['outputview'])
|
||||
|
||||
(uniques, merged_infos) = usm(entries, pb, o_view, config['obi']['taxo'], config['uniq']['merge'], config['uniq']['mergeids'], config['uniq']['categories'])
|
||||
|
||||
if 'merge' in config['uniq'] :
|
||||
merged_keys=set(config['uniq']['merge'])
|
||||
else:
|
||||
merged_keys=set()
|
||||
|
||||
if 'taxo' in config['obi'] :
|
||||
merged_keys.add('taxid')
|
||||
|
||||
# TODO gotta handle special merged columns
|
||||
for k in merged_keys:
|
||||
merged_col_name = "merged_%s" % k
|
||||
if merged_col_name in merged_infos :
|
||||
i_col = entries[k]
|
||||
Column.new_column(o_view,
|
||||
merged_col_name,
|
||||
OBI_INT,
|
||||
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
|
||||
elements_names=merged_infos[merged_col_name]['elt_names'],
|
||||
comments=i_col.comments,
|
||||
alias=merged_col_name
|
||||
)
|
||||
|
||||
for u_id in uniques:
|
||||
u_dict = uniques[u_id]
|
||||
for merged_k in u_dict :
|
||||
if merged_k in merged_infos : # TODO don't enter irrelevant keys to begin with, instead
|
||||
o_view[u_dict['idx']][merged_k] = u_dict[merged_k]
|
||||
|
||||
print("\n")
|
||||
print(repr(o_view))
|
||||
|
||||
d.close()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user