From 6a2759eee612527b14882f6fc6cd4ea53e38f629 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sun, 20 Aug 2017 17:58:36 +0200 Subject: [PATCH] obi import with new input/output API --- python/obitools3/commands/import.pyx | 306 +++++++++++++-------------- 1 file changed, 149 insertions(+), 157 deletions(-) diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index bd8da66..7443149 100644 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -8,12 +8,10 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.files.universalopener cimport uopen from obitools3.parsers.fasta import fastaIterator from obitools3.parsers.fastq import fastqIterator -from obitools3.dms.dms import DMS # TODO cimport doesn't work from obitools3.dms.view.view cimport View -from obitools3.dms.view.typed_view.view_NUC_SEQS import View_NUC_SEQS # TODO cimport doesn't work +from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column - -from obitools3.dms.obiseq import Nuc_Seq +from obitools3.dms.obiseq cimport Nuc_Seq from obitools3.utils cimport tobytes, \ get_obitype, \ @@ -25,6 +23,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \ from obitools3.dms.capi.obierrno cimport obi_errno from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption + from obitools3.uri.decode import open_uri from obitools3.apps.config import logger @@ -50,6 +49,8 @@ def addOptions(parser): def run(config): + cdef tuple input + cdef tuple output cdef int i cdef type value_type cdef obitype_t value_obitype @@ -62,7 +63,6 @@ def run(config): cdef View view cdef object iseq cdef object seq - cdef object inputs cdef Column id_col cdef Column def_col cdef Column seq_col @@ -71,7 +71,7 @@ def run(config): cdef bint rewrite cdef dict dcols cdef int skipping - cdef str tag + cdef bytes tag cdef object value cdef list elt_names cdef 
int old_nb_elements_per_line @@ -84,165 +84,157 @@ def run(config): logger("info","obi import : imports file into an DMS") - inputs = open_uri(config['obi']['inputURI']) + input = open_uri(config['obi']['inputURI']) - if inputs[2]==Nuc_Seq: + if input[2]==Nuc_Seq: v = View_NUC_SEQS else: - v= View + v = View - print(v) - output = open_uri(config['obi']['outputURI'], input=False, newviewtype=v) - print(input) - print(output) + #print(input) + #print(output) - sys.exit() + pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file -# pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file -# -# inputs = uopen(config['import']['filename']) -# -# # Create or open DMS -# d = DMS.open_or_new(config['obi']['defaultdms']) -# -# get_quality = False -# NUC_SEQS_view = False -# if config['import']['seqinformat']=='fasta': -# get_quality = False -# NUC_SEQS_view = True -# iseq = fastaIterator(inputs, skip=config['import']['skip']) -# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) -# elif config['import']['seqinformat']=='fastq': -# get_quality = True -# NUC_SEQS_view = True -# iseq = fastqIterator(inputs, skip=config['import']['skip']) -# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) -# else: -# raise RuntimeError('File format not handled') -# -# # Save basic columns in variables for optimization -# if NUC_SEQS_view : -# id_col = view["ID"] -# def_col = view["DEFINITION"] -# seq_col = view["NUC_SEQ"] -# if get_quality : -# qual_col = view["QUALITY"] -# -# dcols = {} -# -# i = 0 -# for seq in iseq : -# if i == config['import']['only'] : -# break -# else : -# pb(i) -# if NUC_SEQS_view : -# id_col[i] = seq['id'] -# def_col[i] = seq['definition'] -# seq_col[i] = seq['sequence'] -# if get_quality : -# qual_col[i] = seq['quality'] -# -# for tag in seq['tags'] : -# -# value = seq['tags'][tag] -# -# # Check NA value -# if value == config['import']['NA'] : -# 
value = None -# -# if tag not in dcols : -# -# value_type = type(value) -# nb_elts = 1 -# value_obitype = OBI_VOID -# -# if value_type == dict or value_type == list : -# nb_elts = len(value) -# elt_names = list(value) -# else : -# nb_elts = 1 -# elt_names = None -# -# value_obitype = get_obitype(value) -# -# if value_obitype != OBI_VOID : -# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype) -# -# # Fill value -# dcols[tag][0][i] = value -# -# # TODO else log error? -# -# else : -# -# rewrite = False -# -# # Check type adequation -# old_type = dcols[tag][1] -# new_type = OBI_VOID -# new_type = update_obitype(old_type, value) -# if old_type != new_type : -# rewrite = True -# -# try: -# # Fill value -# dcols[tag][0][i] = value -# -# except IndexError : -# -# value_type = type(value) -# old_column = dcols[tag][0] -# old_nb_elements_per_line = old_column.nb_elements_per_line -# new_nb_elements_per_line = 0 -# old_elements_names = old_column.elements_names -# new_elements_names = None -# -# ##################################################################### -# -# # Check the length and keys of column lines if needed -# if value_type == dict : # Check dictionary keys -# for k in value : -# if k not in old_elements_names : -# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) -# rewrite = True -# break -# -# elif value_type == list or value_type == tuple : # Check vector length -# if old_nb_elements_per_line < len(value) : -# new_nb_elements_per_line = len(value) -# rewrite = True -# -# ##################################################################### -# -# if rewrite : -# if new_nb_elements_per_line == 0 and new_elements_names is not None : -# new_nb_elements_per_line = len(new_elements_names) -# -# # Reset obierrno -# obi_errno = 0 -# -# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, -# new_data_type=new_type, -# 
new_nb_elements_per_line=new_nb_elements_per_line, -# new_elements_names=new_elements_names), -# value_obitype) -# -# # Update the dictionary: -# for t in dcols : -# dcols[t] = (view[t], dcols[t][1]) -# -# # Fill value -# dcols[tag][0][i] = value -# -# i+=1 -# -# print("\n") -# print(view.__repr__()) -# -# d.close() + iseq = input[1] + + get_quality = False + NUC_SEQS_view = False + if isinstance(output[1], View) : + view = output[1] + if output[2] == View_NUC_SEQS : + NUC_SEQS_view = True + if "QUALITY" in view : # TODO + get_quality = True + else: + raise NotImplementedError() + + # Save basic columns in variables for optimization + if NUC_SEQS_view : + id_col = view[b"ID"] + def_col = view[b"DEFINITION"] + seq_col = view[b"NUC_SEQ"] + if get_quality : + qual_col = view[b"QUALITY"] + + dcols = {} + + i = 0 + for seq in iseq : + + pb(i) + + if NUC_SEQS_view : + id_col[i] = seq.id + def_col[i] = seq.definition + seq_col[i] = seq.seq + + if get_quality : + qual_col[i] = seq.quality + + for tag in seq : + + if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm... + + value = seq[tag] + + # Check NA value + if value == config['obi']['nastring'] : + value = None + + if tag not in dcols : + + value_type = type(value) + nb_elts = 1 + value_obitype = OBI_VOID + + if value_type == dict or value_type == list : + nb_elts = len(value) + elt_names = list(value) + else : + nb_elts = 1 + elt_names = None + + value_obitype = get_obitype(value) + + if value_obitype != OBI_VOID : + dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype) + + # Fill value + dcols[tag][0][i] = value + + # TODO else log error? 
+ + else : + + rewrite = False + + # Check type adequation + old_type = dcols[tag][1] + new_type = OBI_VOID + new_type = update_obitype(old_type, value) + if old_type != new_type : + rewrite = True + + try: + # Fill value + dcols[tag][0][i] = value + + except IndexError : + + value_type = type(value) + old_column = dcols[tag][0] + old_nb_elements_per_line = old_column.nb_elements_per_line + new_nb_elements_per_line = 0 + old_elements_names = old_column.elements_names + new_elements_names = None + + ##################################################################### + + # Check the length and keys of column lines if needed + if value_type == dict : # Check dictionary keys + for k in value : + if k not in old_elements_names : + new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) + rewrite = True + break + + elif value_type == list or value_type == tuple : # Check vector length + if old_nb_elements_per_line < len(value) : + new_nb_elements_per_line = len(value) + rewrite = True + + ##################################################################### + + if rewrite : + if new_nb_elements_per_line == 0 and new_elements_names is not None : + new_nb_elements_per_line = len(new_elements_names) + + # Reset obierrno + obi_errno = 0 + + dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, + new_data_type=new_type, + new_nb_elements_per_line=new_nb_elements_per_line, + new_elements_names=new_elements_names), + value_obitype) + + # Update the dictionary: + for t in dcols : + dcols[t] = (view[t], dcols[t][1]) + + # Fill value + dcols[tag][0][i] = value + + i+=1 + + print("\n") + print(view.__repr__()) + + input[0].close() # TODO + output[0].close()