Files
obitools3/python/obitools3/commands/import.pyx

250 lines
8.8 KiB
Cython
Raw Normal View History

#cython: language_level=3
2017-07-28 12:41:28 +02:00
import sys
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.view.view cimport View
2017-08-20 17:58:36 +02:00
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column
2017-08-20 17:58:36 +02:00
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms import DMS
from obitools3.utils cimport tobytes, \
get_obitype, \
update_obitype
from obitools3.dms.capi.obitypes cimport obitype_t, \
OBI_VOID, \
OBI_QUAL
from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addTabularInputOption, addMinimalOutputOption
2017-08-20 17:58:36 +02:00
2017-07-28 12:41:28 +02:00
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
# One-line description of this command.
# NOTE(review): presumably shown by the `obi` command-line front-end in its
# help output -- confirm against the application loader.
__title__="Imports sequences from different formats into a DMS"

# Default values of the options attached to this command.
# NOTE(review): none of these keys is read by the code in this file; they are
# presumably merged into the global configuration by the application
# framework -- confirm against obitools3.apps.config.
default_config = {
    'destview': None,
    'skip': 0,
    'only': None,
    'skiperror': False,
    'seqinformat': None,
    'moltype': 'nuc',
    'source': None,
}
def addOptions(parser):
    """
    Declare the options of the import command on *parser*.

    The parser is handed, in this order, to the sequence-input, the
    tabular-input and the minimal-output option builders imported from
    obitools3.apps.optiongroups.
    """
    for add_option_group in (addSequenceInputOption,
                             addTabularInputOption,
                             addMinimalOutputOption):
        add_option_group(parser)
    # addTaxdumpInputOption(parser)
def run(config):
    """
    Import the entries read from the input URI into a view of a DMS.

    The input is opened with open_uri(config['obi']['inputURI']) and the
    output with open_uri(config['obi']['outputURI'], input=False, ...).
    Both calls return a tuple of which this function uses:
        [0] an object that is closed at the end of the import,
        [1] the entry iterator (input) or the destination view (output),
        [2] the entry type (input) or the view type (output).

    When the output view is a View_NUC_SEQS, the ID, DEFINITION, NUC_SEQ
    (and QUALITY, if the first entry carries one) columns are filled from
    the entry attributes. Every other tag of an entry is stored in a column
    named after the tag; that column is created the first time a non-NA
    value is seen and is rewritten with new attributes when a later value
    does not fit and needs another type, more elements per line or new
    element names.

    @param config: the configuration mapping; config['obi'] must provide
                   'inputURI', 'outputURI' and 'nastring'.

    @raise Exception: if the output view could not be created
    @raise NotImplementedError: if the output is not a View
    """

    cdef tuple input
    cdef tuple output
    cdef int i
    cdef type value_type
    cdef obitype_t value_obitype
    cdef obitype_t old_type
    cdef obitype_t new_type
    cdef bint get_quality
    cdef bint NUC_SEQS_view
    cdef int nb_elts
    cdef View view
    cdef object entries
    cdef object entry
    cdef Column id_col
    cdef Column def_col
    cdef Column seq_col
    cdef Column qual_col
    cdef Column old_column
    cdef bint rewrite
    cdef dict dcols
    cdef bytes tag
    cdef object value
    cdef list elt_names
    cdef int old_nb_elements_per_line
    cdef int new_nb_elements_per_line
    cdef list old_elements_names
    cdef list new_elements_names
    cdef ProgressBar pb
    cdef bytes NA_value

    global obi_errno

    # NOTE(review): presumably registers the DMS cleanup handler -- confirm
    DMS.obi_atexit()

    logger("info","obi import : imports file into an DMS")

    input = open_uri(config['obi']['inputURI'])

    # Nucleotide sequence entries go into the dedicated typed view
    if input[2]==Nuc_Seq:
        v = View_NUC_SEQS
    else:
        v = View

    output = open_uri(config['obi']['outputURI'],
                      input=False,
                      newviewtype=v)
                      #quality=get_quality) # TODO
    if output is None:
        raise Exception("Could not create output view")

    pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file

    entries = input[1]

    NA_value = tobytes(config['obi']['nastring']) # TODO

    NUC_SEQS_view = False
    if isinstance(output[1], View) :
        view = output[1]
        if output[2] == View_NUC_SEQS :
            NUC_SEQS_view = True
    else:
        raise NotImplementedError()

    # Save basic columns in variables for optimization
    if NUC_SEQS_view :
        id_col = view[b"ID"] # TODO use macros or globals for column names
        def_col = view[b"DEFINITION"]
        seq_col = view[b"NUC_SEQ"]

    # Maps a tag to the tuple (column, obitype of the column)
    dcols = {}

    # Give the C flag a defined value; it is set for real on the first entry
    get_quality = False

    i = 0
    for entry in entries :

        pb(i)

        if NUC_SEQS_view :
            # Check if there is a sequencing quality associated # TODO
            # Only the first entry decides whether a QUALITY column is created
            if i == 0:
                get_quality = b"QUALITY" in entry
                if get_quality:
                    Column.new_column(view, b"QUALITY", OBI_QUAL)
                    qual_col = view[b"QUALITY"]
            id_col[i] = entry.id
            def_col[i] = entry.definition
            seq_col[i] = entry.seq
            if get_quality :
                qual_col[i] = entry.quality

        for tag in entry :

            if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm...

                value = entry[tag]

                # Check NA value
                if value == NA_value :
                    value = None

                if tag not in dcols :

                    # First value seen for this tag: derive the column
                    # attributes from it
                    value_type = type(value)
                    nb_elts = 1
                    value_obitype = OBI_VOID

                    if value_type == dict or value_type == list :
                        nb_elts = len(value)
                        elt_names = list(value)
                    else :
                        nb_elts = 1
                        elt_names = None

                    value_obitype = get_obitype(value)

                    # No column is created while the type is OBI_VOID; the
                    # tag is looked at again with the next entry
                    if value_obitype != OBI_VOID :
                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)

                        # Fill value
                        dcols[tag][0][i] = value

                    # TODO else log error?

                else :

                    rewrite = False

                    # Check type adequation
                    old_type = dcols[tag][1]
                    new_type = OBI_VOID
                    new_type = update_obitype(old_type, value)
                    if old_type != new_type :
                        rewrite = True

                    try:
                        # Fill value
                        dcols[tag][0][i] = value

                    except IndexError :
                        # The value does not fit in the existing column:
                        # work out which attributes have to change

                        value_type = type(value)
                        old_column = dcols[tag][0]
                        old_nb_elements_per_line = old_column.nb_elements_per_line
                        new_nb_elements_per_line = 0
                        old_elements_names = old_column.elements_names
                        new_elements_names = None

                        #####################################################################

                        # Check the length and keys of column lines if needed
                        if value_type == dict : # Check dictionary keys
                            for k in value :
                                if k not in old_elements_names :
                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
                                    rewrite = True
                                    break

                        elif value_type == list or value_type == tuple : # Check vector length
                            if old_nb_elements_per_line < len(value) :
                                new_nb_elements_per_line = len(value)
                                rewrite = True

                        #####################################################################

                        # NOTE(review): when no rewrite is needed the value
                        # that raised IndexError is silently dropped --
                        # confirm this is intended
                        if rewrite :
                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
                                new_nb_elements_per_line = len(new_elements_names)

                            # Reset obierrno
                            obi_errno = 0

                            # Remember the type the column was rewritten with
                            # (new_type), not value_obitype: the latter is the
                            # stale type of the most recently created column,
                            # possibly belonging to another tag, and would make
                            # the next type adequation check of this tag wrong
                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
                                                                                   new_data_type=new_type,
                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                   new_elements_names=new_elements_names),
                                          new_type)

                            # Update the dictionary:
                            # every cached column is fetched again from the view
                            for t in dcols :
                                dcols[t] = (view[t], dcols[t][1])

                            # Fill value
                            dcols[tag][0][i] = value

        i+=1

    print("\n")
    print(repr(view))

    input[0].close() # TODO ?
    output[0].close()