#cython: language_level=3

from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
from obitools3.dms.dms cimport DMS
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.view cimport View, Line
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, index_t
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger


# TODO silence non-implemented options

__title__="Groups records together"
|
|
|
|
|
|
|
|
def addOptions(parser):
|
|
|
|
addSequenceInputOption(parser)
|
|
addMinimalOutputOption(parser)
|
|
|
|
|
|
group = parser.add_argument_group('obi uniq specific options')
|
|
|
|
group.add_argument('--merge', '-m',
|
|
action="append", dest="uniq:merge",
|
|
metavar="<TAG NAME>",
|
|
default=[],
|
|
type=str,
|
|
help="Attributes to merge.") # TODO must be a 1 elt/line column
|
|
|
|
group.add_argument('--merge-ids', '-e',
|
|
action="store_true", dest="uniq:mergeids",
|
|
default=False,
|
|
help="Add the merged key with all ids of merged sequences.")
|
|
|
|
group.add_argument('--category-attribute', '-c',
|
|
action="append", dest="uniq:categories",
|
|
metavar="<Attribute Name>",
|
|
default=[],
|
|
help="Add one attribute to the list of attributes "
|
|
"used to group sequences before dereplication "
|
|
"(option can be used several times).")
|
|
|
|
group.add_argument('--prefix', '-p',
|
|
action="store_true", dest="uniq:prefix",
|
|
default=False,
|
|
help="Dereplication is done based on prefix matching: "
|
|
"(i) The shortest sequence of each group is a prefix "
|
|
"of any sequence of its group (ii) Two shortest "
|
|
"sequences of any couple of groups are not the"
|
|
"prefix of the other one.")
|
|
|
|
# TODO taxonomy
|
|
|
|
|
|
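# Dereplicate the sequences of `view` into `o_view`: identical sequences are grouped
# together, their counts are summed into COUNT_COLUMN and, for every attribute listed in
# mergedKeys_list, the counts per attribute value are accumulated in a "merged_<key>"
# column of the output view. The taxonomy, mergeIds and categories parameters are
# accepted but not handled yet (see the TODOs below).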
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, Taxonomy taxonomy=None, list mergedKeys_list=None, bint mergeIds=False, list categories=None):

    cdef int i
    cdef int o_idx
    cdef int u_idx
    cdef int u_id
    cdef int i_count
    cdef set mergedKeys
    cdef dict uniques
    cdef dict merged_infos
    cdef object iter_view
    cdef Line i_seq
    cdef Line o_seq
    cdef str key
    cdef bytes key_b
    cdef str mkey
    cdef str merged_col_name
    cdef Column i_col
    cdef Column seq_col
    cdef object to_merge
    cdef Column_line mcol
    cdef Column_line i_mcol

    uniques = {}

    if categories is None:
        categories = []

    if mergedKeys_list is not None:
        mergedKeys = set(mergedKeys_list)
    else:
        mergedKeys = set()

    # if taxonomy is not None:
    #     mergedKeys.add('taxid')

    # First pass over the input: for each attribute to merge, collect the number of
    # elements per line and the element names needed to create the merged columns.
    #logger("info", "obi uniq", "First browsing through the input")
    merged_infos = {}
    i = 0
    iter_view = iter(view)
    for i_seq in iter_view:
        pb(i)
        for key in mergedKeys:
            mkey = "merged_%s" % key
            if key in i_seq:  # TODO what if mkey already in i_seq?
                if mkey not in merged_infos:
                    merged_infos[mkey] = {}
                    mkey_infos = merged_infos[mkey]
                    mkey_infos['nb_elts'] = 1
                    mkey_infos['elt_names'] = [i_seq[key]]
                else:
                    mkey_infos = merged_infos[mkey]
                    if i_seq[key] not in mkey_infos['elt_names']:  # TODO make faster? but how?
                        mkey_infos['elt_names'].append(i_seq[key])
                        mkey_infos['nb_elts'] += 1
        i += 1

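    # Create the merged_<key> columns in the output view, with one element per distinct
    # value collected above and the comments of the original column.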
    for key in mergedKeys:
        merged_col_name = "merged_%s" % key
        i_col = view[key]
        Column.new_column(o_view,
                          merged_col_name,
                          OBI_INT,
                          nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
                          elements_names=merged_infos[merged_col_name]['elt_names'],
                          comments=i_col.comments,
                          alias=merged_col_name  # TODO what if it already exists
                          )

    del(merged_infos)

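    # Second pass: dereplicate. The first occurrence of each sequence is copied to the
    # output view; subsequent occurrences only update its count and merged columns.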
#logger("info", "obi uniq", "Second browsing through the input")
|
|
i = 0
|
|
o_idx = 0
|
|
seq_col = view[NUC_SEQUENCE_COLUMN]
|
|
|
|
iter_view = iter(view)
|
|
for i_seq in iter_view :
|
|
pb(i)
|
|
|
|
#u_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)
|
|
u_id = seq_col.get_line_idx(i)
|
|
|
|
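        # u_id serves as the grouping key for identical sequences (the line index of the
        # sequence value in the sequence column); the category attributes are not taken
        # into account yet (see the commented line above).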
        if u_id in uniques:

            if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
                i_count = 1
            else:
                i_count = i_seq[COUNT_COLUMN]

            u_idx = uniques[u_id]
            o_seq = o_view[u_idx]
            o_seq[COUNT_COLUMN] += i_count

            # if taxonomy is not None and 'taxid' in seq:
            #     s['merged_taxid'][seq['taxid']] =
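
            # Accumulate the incoming sequence's contribution to every merged_<key>
            # column of the sequence kept in the output view.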
            for key in mergedKeys:
                # if key == 'taxid' and mergeIds:  # TODO
                #     if 'taxid_dist' in i_seq:
                #         u_seq["taxid_dist"].update(i_seq["taxid_dist"])
                #     if 'taxid' in i_seq:
                #         u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']

                mkey = "merged_%s" % key
                # Case where the merged keys are updated but the incoming sequence only
                # carries the raw attribute, not a merged key
                if key in i_seq:
                    to_merge = i_seq[key]
                    mcol = o_seq[mkey]
                    if mcol[to_merge] is None:
                        mcol[to_merge] = i_count
                    else:
                        mcol[to_merge] = mcol[to_merge] + i_count
                # Case where the merged key already exists in the incoming sequence
                else:  # TODO is this a good else
                    if mkey in i_seq:
                        mcol = o_seq[mkey]
                        i_mcol = i_seq[mkey]
                        for key_b in i_mcol:
                            if mcol[key_b] is None:
                                mcol[key_b] = i_mcol[key_b]
                            else:
                                mcol[key_b] = mcol[key_b] + i_mcol[key_b]

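            # Attributes that differ between the grouped sequences are reset to None in
            # the output (count, id and sequence columns excepted).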
            for key_b in i_seq.keys():
                # Properly merge the 'merged' attribute if it exists
                # TODO make special columns list?
                if key_b != COUNT_COLUMN and key_b != ID_COLUMN and key_b != NUC_SEQUENCE_COLUMN and key_b in o_seq and o_seq[key_b] != i_seq[key_b]:
                    o_seq[key_b] = None

            # if mergeIds:  # TODO
            #     u_seq['merged'].append(i_seq.id)

        else:
            o_view[o_idx] = i_seq
            o_seq = o_view[o_idx]
            uniques[u_id] = o_idx
            o_idx += 1

            if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
                o_seq[COUNT_COLUMN] = 1

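            # Initialize each merged_<key> column of the newly kept sequence with its own
            # value and count, then clear the original attribute.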
            for key in mergedKeys:
                # if key == 'taxid' and mergeIds:
                #     if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
                #         u_seq["taxid_dist"] = {}
                #     else:
                #         u_seq["taxid_dist"] = o_seq["taxid_dist"]
                #     if 'taxid' in o_seq:
                #         u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
                mkey = "merged_%s" % key
                if key in o_seq:
                    to_merge = o_seq[key]
                    mcol = o_seq[mkey]
                    if to_merge in mcol and mcol[to_merge] is not None:
                        mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
                    else:
                        mcol[to_merge] = o_seq[COUNT_COLUMN]
                    o_seq[key] = None  # TODO delete column eventually -> make C function?

            # if mergeIds:
            #     u_seq['merged'] = [o_seq.id]

        i += 1

    # TODO
    # if taxonomy is not None:
    #     mergeTaxonomyClassification(uniqSeq, taxonomy)


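# Command entry point: opens the views designated by the input and output URIs of the
# configuration, checks that the input is a NUC_SEQS view, and runs the dereplication.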
def run(config):

    cdef tuple input
    cdef tuple output
    cdef View_NUC_SEQS entries
    cdef View_NUC_SEQS o_view
    cdef ProgressBar pb

    logger("info", "obi uniq")

    input = open_uri(config['obi']['inputURI'])

    if input[2] != View_NUC_SEQS:
        raise NotImplementedError('obi uniq only works on NUC_SEQS views')

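    # Both open_uri() calls return a tuple: element 1 is the view, element 2 the type of
    # object pointed to, and element 0 the enclosing object that is closed at the end.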
    output = open_uri(config['obi']['outputURI'],
                      input=False,
                      newviewtype=View_NUC_SEQS)

    entries = input[1]
    o_view = output[1]

    # Initialize the progress bar
    pb = ProgressBar(len(entries), config, seconde=5)

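    # Only exact dereplication is implemented for now; the prefix-based variant selected
    # by --prefix is not available yet.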
    # TODO
    # if options.prefix:
    #     usm = uniqPrefixSequence
    # else:
    usm = uniqSequence

    usm(entries, o_view, pb, taxonomy=None, mergedKeys_list=config['uniq']['merge'], mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])

    # if 'merge' in config['uniq']:
    #     merged_keys = set(config['uniq']['merge'])
    # else:
    #     merged_keys = set()
    #
    # if 'taxo' in config['obi']:
    #     merged_keys.add('taxid')

print("\n")
|
|
print(repr(o_view))
|
|
|
|
input[0].close()
|
|
output[0].close()
|
|
|
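

# Example invocation (the DMS and view names below are only illustrative):
#   obi uniq -m sample my_dms/raw_reads my_dms/uniq_reads
# This groups identical sequences of the 'raw_reads' view, sums their counts, and builds
# a 'merged_sample' column giving, for each unique sequence, its count per 'sample' value.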