obi import with new input/output API

This commit is contained in:
Celine Mercier
2017-08-20 17:58:36 +02:00
parent 38029b1f77
commit 6a2759eee6

View File

@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator from obitools3.parsers.fasta import fastaIterator
from obitools3.parsers.fastq import fastqIterator from obitools3.parsers.fastq import fastqIterator
from obitools3.dms.dms import DMS # TODO cimport doesn't work
from obitools3.dms.view.view cimport View from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS # TODO cimport doesn't work from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column from obitools3.dms.column.column cimport Column
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.utils cimport tobytes, \ from obitools3.utils cimport tobytes, \
get_obitype, \ get_obitype, \
@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
from obitools3.dms.capi.obierrno cimport obi_errno from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger from obitools3.apps.config import logger
@ -50,6 +49,8 @@ def addOptions(parser):
def run(config): def run(config):
cdef tuple input
cdef tuple output
cdef int i cdef int i
cdef type value_type cdef type value_type
cdef obitype_t value_obitype cdef obitype_t value_obitype
@ -62,7 +63,6 @@ def run(config):
cdef View view cdef View view
cdef object iseq cdef object iseq
cdef object seq cdef object seq
cdef object inputs
cdef Column id_col cdef Column id_col
cdef Column def_col cdef Column def_col
cdef Column seq_col cdef Column seq_col
@ -71,7 +71,7 @@ def run(config):
cdef bint rewrite cdef bint rewrite
cdef dict dcols cdef dict dcols
cdef int skipping cdef int skipping
cdef str tag cdef bytes tag
cdef object value cdef object value
cdef list elt_names cdef list elt_names
cdef int old_nb_elements_per_line cdef int old_nb_elements_per_line
@ -84,165 +84,157 @@ def run(config):
logger("info","obi import : imports file into an DMS") logger("info","obi import : imports file into an DMS")
inputs = open_uri(config['obi']['inputURI']) input = open_uri(config['obi']['inputURI'])
if inputs[2]==Nuc_Seq: if input[2]==Nuc_Seq:
v = View_NUC_SEQS v = View_NUC_SEQS
else: else:
v= View v = View
print(v)
output = open_uri(config['obi']['outputURI'], output = open_uri(config['obi']['outputURI'],
input=False, input=False,
newviewtype=v) newviewtype=v)
print(input) #print(input)
print(output) #print(output)
sys.exit() pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
# pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file iseq = input[1]
#
# inputs = uopen(config['import']['filename']) get_quality = False
# NUC_SEQS_view = False
# # Create or open DMS if isinstance(output[1], View) :
# d = DMS.open_or_new(config['obi']['defaultdms']) view = output[1]
# if output[2] == View_NUC_SEQS :
# get_quality = False NUC_SEQS_view = True
# NUC_SEQS_view = False if "QUALITY" in view : # TODO
# if config['import']['seqinformat']=='fasta': get_quality = True
# get_quality = False else:
# NUC_SEQS_view = True raise NotImplementedError()
# iseq = fastaIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) # Save basic columns in variables for optimization
# elif config['import']['seqinformat']=='fastq': if NUC_SEQS_view :
# get_quality = True id_col = view[b"ID"]
# NUC_SEQS_view = True def_col = view[b"DEFINITION"]
# iseq = fastqIterator(inputs, skip=config['import']['skip']) seq_col = view[b"NUC_SEQ"]
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) if get_quality :
# else: qual_col = view[b"QUALITY"]
# raise RuntimeError('File format not handled')
# dcols = {}
# # Save basic columns in variables for optimization
# if NUC_SEQS_view : i = 0
# id_col = view["ID"] for seq in iseq :
# def_col = view["DEFINITION"]
# seq_col = view["NUC_SEQ"] pb(i)
# if get_quality :
# qual_col = view["QUALITY"] if NUC_SEQS_view :
# id_col[i] = seq.id
# dcols = {} def_col[i] = seq.definition
# seq_col[i] = seq.seq
# i = 0
# for seq in iseq : if get_quality :
# if i == config['import']['only'] : qual_col[i] = seq.quality
# break
# else : for tag in seq :
# pb(i)
# if NUC_SEQS_view : if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm...
# id_col[i] = seq['id']
# def_col[i] = seq['definition'] value = seq[tag]
# seq_col[i] = seq['sequence']
# if get_quality : # Check NA value
# qual_col[i] = seq['quality'] if value == config['obi']['nastring'] :
# value = None
# for tag in seq['tags'] :
# if tag not in dcols :
# value = seq['tags'][tag]
# value_type = type(value)
# # Check NA value nb_elts = 1
# if value == config['import']['NA'] : value_obitype = OBI_VOID
# value = None
# if value_type == dict or value_type == list :
# if tag not in dcols : nb_elts = len(value)
# elt_names = list(value)
# value_type = type(value) else :
# nb_elts = 1 nb_elts = 1
# value_obitype = OBI_VOID elt_names = None
#
# if value_type == dict or value_type == list : value_obitype = get_obitype(value)
# nb_elts = len(value)
# elt_names = list(value) if value_obitype != OBI_VOID :
# else : dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# nb_elts = 1
# elt_names = None # Fill value
# dcols[tag][0][i] = value
# value_obitype = get_obitype(value)
# # TODO else log error?
# if value_obitype != OBI_VOID :
# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype) else :
#
# # Fill value rewrite = False
# dcols[tag][0][i] = value
# # Check type adequation
# # TODO else log error? old_type = dcols[tag][1]
# new_type = OBI_VOID
# else : new_type = update_obitype(old_type, value)
# if old_type != new_type :
# rewrite = False rewrite = True
#
# # Check type adequation try:
# old_type = dcols[tag][1] # Fill value
# new_type = OBI_VOID dcols[tag][0][i] = value
# new_type = update_obitype(old_type, value)
# if old_type != new_type : except IndexError :
# rewrite = True
# value_type = type(value)
# try: old_column = dcols[tag][0]
# # Fill value old_nb_elements_per_line = old_column.nb_elements_per_line
# dcols[tag][0][i] = value new_nb_elements_per_line = 0
# old_elements_names = old_column.elements_names
# except IndexError : new_elements_names = None
#
# value_type = type(value) #####################################################################
# old_column = dcols[tag][0]
# old_nb_elements_per_line = old_column.nb_elements_per_line # Check the length and keys of column lines if needed
# new_nb_elements_per_line = 0 if value_type == dict : # Check dictionary keys
# old_elements_names = old_column.elements_names for k in value :
# new_elements_names = None if k not in old_elements_names :
# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
# ##################################################################### rewrite = True
# break
# # Check the length and keys of column lines if needed
# if value_type == dict : # Check dictionary keys elif value_type == list or value_type == tuple : # Check vector length
# for k in value : if old_nb_elements_per_line < len(value) :
# if k not in old_elements_names : new_nb_elements_per_line = len(value)
# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) rewrite = True
# rewrite = True
# break #####################################################################
#
# elif value_type == list or value_type == tuple : # Check vector length if rewrite :
# if old_nb_elements_per_line < len(value) : if new_nb_elements_per_line == 0 and new_elements_names is not None :
# new_nb_elements_per_line = len(value) new_nb_elements_per_line = len(new_elements_names)
# rewrite = True
# # Reset obierrno
# ##################################################################### obi_errno = 0
#
# if rewrite : dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
# if new_nb_elements_per_line == 0 and new_elements_names is not None : new_data_type=new_type,
# new_nb_elements_per_line = len(new_elements_names) new_nb_elements_per_line=new_nb_elements_per_line,
# new_elements_names=new_elements_names),
# # Reset obierrno value_obitype)
# obi_errno = 0
# # Update the dictionary:
# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, for t in dcols :
# new_data_type=new_type, dcols[t] = (view[t], dcols[t][1])
# new_nb_elements_per_line=new_nb_elements_per_line,
# new_elements_names=new_elements_names), # Fill value
# value_obitype) dcols[tag][0][i] = value
#
# # Update the dictionary: i+=1
# for t in dcols :
# dcols[t] = (view[t], dcols[t][1]) print("\n")
# print(view.__repr__())
# # Fill value
# dcols[tag][0][i] = value input[0].close() # TODO
# output[0].close()
# i+=1
#
# print("\n")
# print(view.__repr__())
#
# d.close()