obi uniq: added taxonomy handling
This commit is contained in:
@ -5,4 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
|
|||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
|
|
||||||
|
|
||||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
|
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy)
|
||||||
|
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*)
|
||||||
|
@ -7,13 +7,11 @@ from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
|||||||
from obitools3.dms.column.column cimport Column, Column_line
|
from obitools3.dms.column.column cimport Column, Column_line
|
||||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
|
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
|
||||||
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
|
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
|
||||||
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
|
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption, addTaxonomyInputOption
|
||||||
from obitools3.uri.decode import open_uri
|
from obitools3.uri.decode import open_uri
|
||||||
from obitools3.apps.config import logger
|
from obitools3.apps.config import logger
|
||||||
|
|
||||||
|
|
||||||
# TODO silence non-implemented options
|
|
||||||
|
|
||||||
__title__="Groups records together"
|
__title__="Groups records together"
|
||||||
|
|
||||||
|
|
||||||
@ -22,7 +20,7 @@ def addOptions(parser):
|
|||||||
|
|
||||||
addSequenceInputOption(parser)
|
addSequenceInputOption(parser)
|
||||||
addMinimalOutputOption(parser)
|
addMinimalOutputOption(parser)
|
||||||
|
addTaxonomyInputOption(parser)
|
||||||
|
|
||||||
group = parser.add_argument_group('obi uniq specific options')
|
group = parser.add_argument_group('obi uniq specific options')
|
||||||
|
|
||||||
@ -46,17 +44,138 @@ def addOptions(parser):
|
|||||||
"used to group sequences before dereplication "
|
"used to group sequences before dereplication "
|
||||||
"(option can be used several times).")
|
"(option can be used several times).")
|
||||||
|
|
||||||
group.add_argument('--prefix', '-p',
|
# group.add_argument('--prefix', '-p',
|
||||||
action="store_true", dest="uniq:prefix",
|
# action="store_true", dest="uniq:prefix",
|
||||||
default=False,
|
# default=False,
|
||||||
help="Dereplication is done based on prefix matching: "
|
# help="Dereplication is done based on prefix matching: "
|
||||||
"(i) The shortest sequence of each group is a prefix "
|
# "(i) The shortest sequence of each group is a prefix "
|
||||||
"of any sequence of its group (ii) Two shortest "
|
# "of any sequence of its group (ii) Two shortest "
|
||||||
"sequences of any couple of groups are not the"
|
# "sequences of any couple of groups are not the"
|
||||||
"prefix of the other one.")
|
# "prefix of the other one.")
|
||||||
|
|
||||||
|
|
||||||
cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
|
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||||
|
|
||||||
|
cdef int taxid
|
||||||
|
cdef Line seq
|
||||||
|
cdef list m_taxids
|
||||||
|
cdef bytes k
|
||||||
|
cdef object tsp
|
||||||
|
cdef object tgn
|
||||||
|
cdef object tfa
|
||||||
|
cdef object sp_sn
|
||||||
|
cdef object gn_sn
|
||||||
|
cdef object fa_sn
|
||||||
|
|
||||||
|
# Create columns
|
||||||
|
if 'species' in o_view and o_view['species'].data_type_int != OBI_INT :
|
||||||
|
o_view.delete_column('species')
|
||||||
|
if 'species' not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
'species',
|
||||||
|
OBI_INT
|
||||||
|
)
|
||||||
|
|
||||||
|
if 'genus' in o_view and o_view['genus'].data_type_int != OBI_INT :
|
||||||
|
o_view.delete_column('genus')
|
||||||
|
if 'genus' not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
'genus',
|
||||||
|
OBI_INT
|
||||||
|
)
|
||||||
|
|
||||||
|
if 'family' in o_view and o_view['family'].data_type_int != OBI_INT :
|
||||||
|
o_view.delete_column('family')
|
||||||
|
if 'family' not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
'family',
|
||||||
|
OBI_INT
|
||||||
|
)
|
||||||
|
|
||||||
|
if 'species_name' in o_view and o_view['species_name'].data_type_int != OBI_STR :
|
||||||
|
o_view.delete_column('species_name')
|
||||||
|
if 'species_name' not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
'species_name',
|
||||||
|
OBI_STR
|
||||||
|
)
|
||||||
|
|
||||||
|
if 'genus_name' in o_view and o_view['genus_name'].data_type_int != OBI_STR :
|
||||||
|
o_view.delete_column('genus_name')
|
||||||
|
if 'genus_name' not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
'genus_name',
|
||||||
|
OBI_STR
|
||||||
|
)
|
||||||
|
|
||||||
|
if 'family_name' in o_view and o_view['family_name'].data_type_int != OBI_STR :
|
||||||
|
o_view.delete_column('family_name')
|
||||||
|
if 'family_name' not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
'family_name',
|
||||||
|
OBI_STR
|
||||||
|
)
|
||||||
|
|
||||||
|
if 'rank' in o_view and o_view['rank'].data_type_int != OBI_STR :
|
||||||
|
o_view.delete_column('rank')
|
||||||
|
if 'rank' not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
'rank',
|
||||||
|
OBI_STR
|
||||||
|
)
|
||||||
|
|
||||||
|
if 'scientific_name' in o_view and o_view['scientific_name'].data_type_int != OBI_STR :
|
||||||
|
o_view.delete_column('scientific_name')
|
||||||
|
if 'scientific_name' not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
'scientific_name',
|
||||||
|
OBI_STR
|
||||||
|
)
|
||||||
|
|
||||||
|
for seq in o_view:
|
||||||
|
|
||||||
|
if 'merged_taxid' in seq :
|
||||||
|
m_taxids = []
|
||||||
|
for k in seq['merged_taxid'].keys() :
|
||||||
|
if seq['merged_taxid'][k] is not None:
|
||||||
|
m_taxids.append(int(k))
|
||||||
|
taxid = taxonomy.last_common_taxon(*m_taxids)
|
||||||
|
seq['taxid'] = taxid
|
||||||
|
tsp = taxonomy.get_species(taxid)
|
||||||
|
tgn = taxonomy.get_genus(taxid)
|
||||||
|
tfa = taxonomy.get_family(taxid)
|
||||||
|
|
||||||
|
if tsp is not None:
|
||||||
|
sp_sn = taxonomy.get_scientific_name(tsp)
|
||||||
|
else:
|
||||||
|
sp_sn = None # TODO was '###', discuss
|
||||||
|
tsp = None # TODO was '-1', discuss
|
||||||
|
|
||||||
|
if tgn is not None:
|
||||||
|
gn_sn = taxonomy.get_scientific_name(tgn)
|
||||||
|
else:
|
||||||
|
gn_sn = None
|
||||||
|
tgn = None
|
||||||
|
|
||||||
|
if tfa is not None:
|
||||||
|
fa_sn = taxonomy.get_scientific_name(tfa)
|
||||||
|
else:
|
||||||
|
fa_sn = None
|
||||||
|
tfa = None
|
||||||
|
|
||||||
|
seq['species'] = tsp
|
||||||
|
seq['genus'] = tgn
|
||||||
|
seq['family'] = tfa
|
||||||
|
|
||||||
|
seq['species_name'] = sp_sn
|
||||||
|
seq['genus_name'] = gn_sn
|
||||||
|
seq['family_name'] = fa_sn
|
||||||
|
|
||||||
|
seq['rank'] = taxonomy.get_rank(taxid)
|
||||||
|
seq['scientific_name'] = taxonomy.get_scientific_name(taxid)
|
||||||
|
|
||||||
|
|
||||||
|
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None) :
|
||||||
|
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef int o_idx
|
cdef int o_idx
|
||||||
@ -80,14 +199,16 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
cdef Column_line i_mcol
|
cdef Column_line i_mcol
|
||||||
cdef list catl
|
cdef list catl
|
||||||
cdef dict merged_ids_dict
|
cdef dict merged_ids_dict
|
||||||
|
cdef int nb_ids_max
|
||||||
cdef bytes k
|
cdef set ids_list
|
||||||
cdef int n
|
cdef bytes k
|
||||||
|
cdef int n
|
||||||
|
cdef dict taxid_dist_dict
|
||||||
|
|
||||||
#print(categories)
|
#print(categories)
|
||||||
|
|
||||||
uniques = {}
|
uniques = {}
|
||||||
|
|
||||||
if mergedKeys_list is not None:
|
if mergedKeys_list is not None:
|
||||||
mergedKeys=set(mergedKeys_list)
|
mergedKeys=set(mergedKeys_list)
|
||||||
else:
|
else:
|
||||||
@ -95,10 +216,10 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
|
|
||||||
if taxonomy is not None:
|
if taxonomy is not None:
|
||||||
mergedKeys.add('taxid')
|
mergedKeys.add('taxid')
|
||||||
|
|
||||||
if categories is None:
|
if categories is None:
|
||||||
categories = []
|
categories = []
|
||||||
|
|
||||||
# Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names
|
# Going through columns to merge a first time to create merged columns with the good number of elements per line and elemnts names
|
||||||
logger("info", "First browsing through the input")
|
logger("info", "First browsing through the input")
|
||||||
merged_infos = {}
|
merged_infos = {}
|
||||||
@ -132,11 +253,12 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
comments=i_col.comments,
|
comments=i_col.comments,
|
||||||
alias=merged_col_name # TODO what if it already exists
|
alias=merged_col_name # TODO what if it already exists
|
||||||
)
|
)
|
||||||
|
|
||||||
del(merged_infos)
|
del(merged_infos)
|
||||||
|
|
||||||
logger("info", "Second browsing through the input")
|
logger("info", "Second browsing through the input")
|
||||||
merged_ids_dict = {}
|
merged_ids_dict = {}
|
||||||
|
taxid_dist_dict = {}
|
||||||
i = 0
|
i = 0
|
||||||
o_idx = 0
|
o_idx = 0
|
||||||
seq_col = view[NUC_SEQUENCE_COLUMN]
|
seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||||
@ -167,19 +289,17 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
o_seq = o_view[u_idx]
|
o_seq = o_view[u_idx]
|
||||||
o_seq[COUNT_COLUMN] += i_count
|
o_seq[COUNT_COLUMN] += i_count
|
||||||
|
|
||||||
# if taxonomy is not None and 'taxid' in seq:
|
|
||||||
# s['merged_taxid'][seq['taxid']]=
|
|
||||||
for key in mergedKeys:
|
for key in mergedKeys:
|
||||||
# if key=='taxid' and mergeIds: # TODO
|
if key=='taxid' and mergeIds:
|
||||||
# if 'taxid_dist' in i_seq:
|
if 'taxid_dist' in i_seq:
|
||||||
# u_seq["taxid_dist"].update(i_seq["taxid_dist"])
|
taxid_dist_dict[o_seq.id].update(i_seq["taxid_dist"])
|
||||||
# if 'taxid' in i_seq:
|
if 'taxid' in i_seq:
|
||||||
# u_seq["taxid_dist"][i_seq.id] = i_seq['taxid']
|
taxid_dist_dict[o_seq.id][i_seq.id] = i_seq['taxid']
|
||||||
|
|
||||||
mkey = "merged_%s" % key
|
mkey = "merged_%s" % key
|
||||||
#cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
|
#cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
|
||||||
if key in i_seq:
|
if key in i_seq:
|
||||||
to_merge = i_seq[key]
|
to_merge = str(i_seq[key])
|
||||||
mcol = o_seq[mkey]
|
mcol = o_seq[mkey]
|
||||||
if mcol[to_merge] is None:
|
if mcol[to_merge] is None:
|
||||||
mcol[to_merge] = i_count
|
mcol[to_merge] = i_count
|
||||||
@ -214,35 +334,36 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
|
|
||||||
if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
|
if COUNT_COLUMN not in o_seq or o_seq[COUNT_COLUMN] is None:
|
||||||
o_seq[COUNT_COLUMN] = 1
|
o_seq[COUNT_COLUMN] = 1
|
||||||
|
|
||||||
for key in mergedKeys:
|
for key in mergedKeys:
|
||||||
# if key=='taxid' and mergeIds:
|
if key=='taxid' and mergeIds:
|
||||||
# if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
|
if 'taxid' in o_seq and 'taxid_dist' not in o_seq:
|
||||||
# u_seq["taxid_dist"] = {}
|
taxid_dist_dict[i_seq.id] = {}
|
||||||
# else :
|
if 'taxid' in o_seq:
|
||||||
# u_seq["taxid_dist"] = o_seq["taxid_dist"]
|
taxid_dist_dict[i_seq.id][o_seq.id] = o_seq['taxid']
|
||||||
# if 'taxid' in o_seq:
|
|
||||||
# u_seq["taxid_dist"][o_seq.id] = o_seq['taxid']
|
|
||||||
mkey = "merged_%s" % key
|
mkey = "merged_%s" % key
|
||||||
if key in o_seq:
|
if key in o_seq:
|
||||||
to_merge = o_seq[key]
|
to_merge = str(o_seq[key])
|
||||||
mcol = o_seq[mkey]
|
mcol = o_seq[mkey]
|
||||||
if to_merge in mcol and mcol[to_merge] is not None:
|
if to_merge in mcol and mcol[to_merge] is not None:
|
||||||
mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
|
mcol[to_merge] = mcol[to_merge] + o_seq[COUNT_COLUMN]
|
||||||
else:
|
else:
|
||||||
mcol[to_merge] = o_seq[COUNT_COLUMN]
|
mcol[to_merge] = o_seq[COUNT_COLUMN]
|
||||||
o_seq[key] = None # TODO delete column eventually -> make C function?
|
o_seq[key] = None # TODO delete column eventually -> make C function that deletes 'empty' columns
|
||||||
|
|
||||||
if mergeIds:
|
if mergeIds:
|
||||||
merged_ids_dict[o_seq.id] = [o_seq.id] # TODO check that this id is added too in the original obiuniq
|
merged_ids_dict[o_seq.id] = [o_seq.id]
|
||||||
#o_seq['merged']=[o_seq.id]
|
#o_seq['merged']=[o_seq.id]
|
||||||
|
|
||||||
i+=1
|
i+=1
|
||||||
|
|
||||||
# Merged ids column
|
# Merged ids column
|
||||||
if mergeIds :
|
if mergeIds :
|
||||||
|
logger("info", "Building merged ids column")
|
||||||
nb_ids_max = 0
|
nb_ids_max = 0
|
||||||
|
ids_list = set()
|
||||||
for k in merged_ids_dict :
|
for k in merged_ids_dict :
|
||||||
|
ids_list = set(list(ids_list) + merged_ids_dict[k])
|
||||||
n = len(merged_ids_dict[k])
|
n = len(merged_ids_dict[k])
|
||||||
if n > nb_ids_max :
|
if n > nb_ids_max :
|
||||||
nb_ids_max = n
|
nb_ids_max = n
|
||||||
@ -259,9 +380,24 @@ cdef uniqSequence(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list
|
|||||||
for o_seq in o_view:
|
for o_seq in o_view:
|
||||||
o_seq['merged'] = merged_ids_dict[o_seq.id]
|
o_seq['merged'] = merged_ids_dict[o_seq.id]
|
||||||
|
|
||||||
#TODO
|
if mergeIds and 'taxid' in mergedKeys:
|
||||||
#if taxonomy is not None:
|
logger("info", "Building taxid dist column")
|
||||||
# mergeTaxonomyClassification(uniqSeq, taxonomy)
|
Column.new_column(o_view,
|
||||||
|
"taxid_dist",
|
||||||
|
OBI_INT,
|
||||||
|
nb_elements_per_line=len(ids_list),
|
||||||
|
elements_names=list(ids_list),
|
||||||
|
comments="obi uniq taxid dist",
|
||||||
|
alias="taxid_dist" # TODO what if it already exists
|
||||||
|
)
|
||||||
|
|
||||||
|
for o_seq in o_view:
|
||||||
|
if o_seq.id in taxid_dist_dict :
|
||||||
|
o_seq['taxid_dist'] = taxid_dist_dict[o_seq.id]
|
||||||
|
|
||||||
|
if taxonomy is not None:
|
||||||
|
logger("info", "Merging taxonomy classification")
|
||||||
|
merge_taxonomy_classification(o_view, taxonomy)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -269,6 +405,8 @@ def run(config):
|
|||||||
|
|
||||||
cdef tuple input
|
cdef tuple input
|
||||||
cdef tuple output
|
cdef tuple output
|
||||||
|
cdef tuple taxo_uri
|
||||||
|
cdef Taxonomy taxo
|
||||||
cdef View_NUC_SEQS entries
|
cdef View_NUC_SEQS entries
|
||||||
cdef View_NUC_SEQS o_view
|
cdef View_NUC_SEQS o_view
|
||||||
cdef ProgressBar pb
|
cdef ProgressBar pb
|
||||||
@ -288,7 +426,12 @@ def run(config):
|
|||||||
|
|
||||||
entries = input[1]
|
entries = input[1]
|
||||||
o_view = output[1]
|
o_view = output[1]
|
||||||
|
if 'taxoURI' in config['obi'] : # TODO default None problem
|
||||||
|
taxo_uri = open_uri(config['obi']['taxoURI'])
|
||||||
|
taxo = taxo_uri[1]
|
||||||
|
else :
|
||||||
|
taxo = None
|
||||||
|
|
||||||
# Initialize the progress bar
|
# Initialize the progress bar
|
||||||
pb = ProgressBar(len(entries), config, seconde=5)
|
pb = ProgressBar(len(entries), config, seconde=5)
|
||||||
|
|
||||||
@ -296,26 +439,13 @@ def run(config):
|
|||||||
# if options.prefix:
|
# if options.prefix:
|
||||||
# usm = uniqPrefixSequence
|
# usm = uniqPrefixSequence
|
||||||
# else:
|
# else:
|
||||||
usm = uniqSequence
|
usm = uniq_sequences
|
||||||
|
|
||||||
# if 'taxoURI' in config['obi'] : # TODO default None problem
|
|
||||||
# taxo = open_uri(config['obi']['taxoURI'])
|
|
||||||
# else :
|
|
||||||
taxo = None
|
|
||||||
|
|
||||||
usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
|
usm(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'])
|
||||||
|
|
||||||
# if 'merge' in config['uniq'] :
|
|
||||||
# merged_keys=set(config['uniq']['merge'])
|
|
||||||
# else:
|
|
||||||
# merged_keys=set()
|
|
||||||
#
|
|
||||||
# if 'taxo' in config['obi'] :
|
|
||||||
# merged_keys.add('taxid')
|
|
||||||
|
|
||||||
print("\n")
|
print("\n")
|
||||||
print(repr(o_view))
|
print(repr(o_view))
|
||||||
|
|
||||||
input[0].close()
|
input[0].close()
|
||||||
output[0].close()
|
output[0].close()
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user