obi import with new input/output API

This commit is contained in:
Celine Mercier
2017-08-20 17:58:36 +02:00
parent 38029b1f77
commit 6a2759eee6

View File

@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.dms.dms import DMS # TODO cimport doesn't work
from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS # TODO cimport doesn't work
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.utils cimport tobytes, \
get_obitype, \
@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
@ -50,6 +49,8 @@ def addOptions(parser):
def run(config):
cdef tuple input
cdef tuple output
cdef int i
cdef type value_type
cdef obitype_t value_obitype
@ -62,7 +63,6 @@ def run(config):
cdef View view
cdef object iseq
cdef object seq
cdef object inputs
cdef Column id_col
cdef Column def_col
cdef Column seq_col
@ -71,7 +71,7 @@ def run(config):
cdef bint rewrite
cdef dict dcols
cdef int skipping
cdef str tag
cdef bytes tag
cdef object value
cdef list elt_names
cdef int old_nb_elements_per_line
@ -84,165 +84,157 @@ def run(config):
logger("info","obi import : imports file into an DMS")
inputs = open_uri(config['obi']['inputURI'])
input = open_uri(config['obi']['inputURI'])
if inputs[2]==Nuc_Seq:
if input[2]==Nuc_Seq:
v = View_NUC_SEQS
else:
v= View
v = View
print(v)
output = open_uri(config['obi']['outputURI'],
input=False,
newviewtype=v)
print(input)
print(output)
#print(input)
#print(output)
sys.exit()
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
# pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
#
# inputs = uopen(config['import']['filename'])
#
# # Create or open DMS
# d = DMS.open_or_new(config['obi']['defaultdms'])
#
# get_quality = False
# NUC_SEQS_view = False
# if config['import']['seqinformat']=='fasta':
# get_quality = False
# NUC_SEQS_view = True
# iseq = fastaIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# elif config['import']['seqinformat']=='fastq':
# get_quality = True
# NUC_SEQS_view = True
# iseq = fastqIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# else:
# raise RuntimeError('File format not handled')
#
# # Save basic columns in variables for optimization
# if NUC_SEQS_view :
# id_col = view["ID"]
# def_col = view["DEFINITION"]
# seq_col = view["NUC_SEQ"]
# if get_quality :
# qual_col = view["QUALITY"]
#
# dcols = {}
#
# i = 0
# for seq in iseq :
# if i == config['import']['only'] :
# break
# else :
# pb(i)
# if NUC_SEQS_view :
# id_col[i] = seq['id']
# def_col[i] = seq['definition']
# seq_col[i] = seq['sequence']
# if get_quality :
# qual_col[i] = seq['quality']
#
# for tag in seq['tags'] :
#
# value = seq['tags'][tag]
#
# # Check NA value
# if value == config['import']['NA'] :
# value = None
#
# if tag not in dcols :
#
# value_type = type(value)
# nb_elts = 1
# value_obitype = OBI_VOID
#
# if value_type == dict or value_type == list :
# nb_elts = len(value)
# elt_names = list(value)
# else :
# nb_elts = 1
# elt_names = None
#
# value_obitype = get_obitype(value)
#
# if value_obitype != OBI_VOID :
# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
#
# # Fill value
# dcols[tag][0][i] = value
#
# # TODO else log error?
#
# else :
#
# rewrite = False
#
# # Check type adequation
# old_type = dcols[tag][1]
# new_type = OBI_VOID
# new_type = update_obitype(old_type, value)
# if old_type != new_type :
# rewrite = True
#
# try:
# # Fill value
# dcols[tag][0][i] = value
#
# except IndexError :
#
# value_type = type(value)
# old_column = dcols[tag][0]
# old_nb_elements_per_line = old_column.nb_elements_per_line
# new_nb_elements_per_line = 0
# old_elements_names = old_column.elements_names
# new_elements_names = None
#
# #####################################################################
#
# # Check the length and keys of column lines if needed
# if value_type == dict : # Check dictionary keys
# for k in value :
# if k not in old_elements_names :
# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
# rewrite = True
# break
#
# elif value_type == list or value_type == tuple : # Check vector length
# if old_nb_elements_per_line < len(value) :
# new_nb_elements_per_line = len(value)
# rewrite = True
#
# #####################################################################
#
# if rewrite :
# if new_nb_elements_per_line == 0 and new_elements_names is not None :
# new_nb_elements_per_line = len(new_elements_names)
#
# # Reset obierrno
# obi_errno = 0
#
# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
# new_data_type=new_type,
# new_nb_elements_per_line=new_nb_elements_per_line,
# new_elements_names=new_elements_names),
# value_obitype)
#
# # Update the dictionary:
# for t in dcols :
# dcols[t] = (view[t], dcols[t][1])
#
# # Fill value
# dcols[tag][0][i] = value
#
# i+=1
#
# print("\n")
# print(view.__repr__())
#
# d.close()
iseq = input[1]
get_quality = False
NUC_SEQS_view = False
if isinstance(output[1], View) :
view = output[1]
if output[2] == View_NUC_SEQS :
NUC_SEQS_view = True
if "QUALITY" in view : # TODO
get_quality = True
else:
raise NotImplementedError()
# Save basic columns in variables for optimization
if NUC_SEQS_view :
id_col = view[b"ID"]
def_col = view[b"DEFINITION"]
seq_col = view[b"NUC_SEQ"]
if get_quality :
qual_col = view[b"QUALITY"]
dcols = {}
i = 0
for seq in iseq :
pb(i)
if NUC_SEQS_view :
id_col[i] = seq.id
def_col[i] = seq.definition
seq_col[i] = seq.seq
if get_quality :
qual_col[i] = seq.quality
for tag in seq :
if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm...
value = seq[tag]
# Check NA value
if value == config['obi']['nastring'] :
value = None
if tag not in dcols :
value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID
if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
dcols[tag][0][i] = value
# TODO else log error?
else :
rewrite = False
# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True
try:
# Fill value
dcols[tag][0][i] = value
except IndexError :
value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
#####################################################################
# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True
#####################################################################
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names),
value_obitype)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
# Fill value
dcols[tag][0][i] = value
i+=1
print("\n")
print(view.__repr__())
input[0].close() # TODO
output[0].close()