diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index 1160870..0f1897f 100755 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -269,125 +269,134 @@ def run(config): pb(i) elif not i%50000: logger("info", "Imported %d entries", i) - - if NUC_SEQS_view: - id_col[i] = entry.id - def_col[i] = entry.definition - seq_col[i] = entry.seq - # Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet - if i == 0: - get_quality = QUALITY_COLUMN in entry + + try: + + if NUC_SEQS_view: + id_col[i] = entry.id + def_col[i] = entry.definition + seq_col[i] = entry.seq + # Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet + if i == 0: + get_quality = QUALITY_COLUMN in entry + if get_quality: + Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) + qual_col = view[QUALITY_COLUMN] if get_quality: - Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) - qual_col = view[QUALITY_COLUMN] - if get_quality: - qual_col[i] = entry.quality - - for tag in entry : - - if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty - - value = entry[tag] - if tag == b"taxid": - tag = TAXID_COLUMN - if tag == b"count": - tag = COUNT_COLUMN - if tag[:7] == b"merged_": - tag = MERGED_PREFIX+tag[7:] - - if tag not in dcols : - - value_type = type(value) - nb_elts = 1 - value_obitype = OBI_VOID - - if value_type == dict or value_type == list : - nb_elts = len(value) - elt_names = list(value) - else : - nb_elts = 1 - elt_names = None - - value_obitype = get_obitype(value) - - if value_obitype != OBI_VOID : - dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype) - - # Fill value - if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value - value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column - dcols[tag][0][i] = value - - # TODO else log error? - - else : - - rewrite = False - - # Check type adequation - old_type = dcols[tag][1] - new_type = OBI_VOID - new_type = update_obitype(old_type, value) - if old_type != new_type : - rewrite = True - - try: - # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key - if type(value) == dict and \ - dcols[tag][0].nb_elements_per_line == 1 \ - and set(dcols[tag][0].elements_names) != set(value.keys()) : - raise IndexError # trigger column rewrite + qual_col[i] = entry.quality + + for tag in entry : + + if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty + + value = entry[tag] + if tag == b"taxid": + tag = TAXID_COLUMN + if tag == b"count": + tag = COUNT_COLUMN + if tag[:7] == b"merged_": + tag = MERGED_PREFIX+tag[7:] - # Fill value - dcols[tag][0][i] = value - - except IndexError : - + if tag not in dcols : + value_type = type(value) - old_column = dcols[tag][0] - old_nb_elements_per_line = old_column.nb_elements_per_line - new_nb_elements_per_line = 0 - old_elements_names = old_column.elements_names - new_elements_names = None + nb_elts = 1 + value_obitype = OBI_VOID + + if value_type == dict or value_type == list : + nb_elts = len(value) + elt_names = list(value) + else : + nb_elts = 1 + elt_names = None + + value_obitype = get_obitype(value) + + if value_obitype != OBI_VOID : + dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype) + + # Fill value + if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value + value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column + dcols[tag][0][i] = value + + # TODO else log error? - ##################################################################### - - # Check the length and keys of column lines if needed - if value_type == dict : # Check dictionary keys - for k in value : - if k not in old_elements_names : - new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) - rewrite = True - break - - elif value_type == list or value_type == tuple : # Check vector length - if old_nb_elements_per_line < len(value) : - new_nb_elements_per_line = len(value) - rewrite = True - - ##################################################################### - - if rewrite : - if new_nb_elements_per_line == 0 and new_elements_names is not None : - new_nb_elements_per_line = len(new_elements_names) - - # Reset obierrno - obi_errno = 0 - - dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, - new_data_type=new_type, - new_nb_elements_per_line=new_nb_elements_per_line, - new_elements_names=new_elements_names, - rewrite_last_line=False), - new_type) - - # Update the dictionary: - for t in dcols : - dcols[t] = (view[t], dcols[t][1]) - + else : + + rewrite = False + + # Check type adequation + old_type = dcols[tag][1] + new_type = OBI_VOID + new_type = update_obitype(old_type, value) + if old_type != new_type : + rewrite = True + + try: + # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key + if type(value) == dict and \ + dcols[tag][0].nb_elements_per_line == 1 \ + and set(dcols[tag][0].elements_names) != set(value.keys()) : + raise IndexError # trigger column rewrite + # Fill value dcols[tag][0][i] = value - + + except IndexError : + + value_type = type(value) + old_column = dcols[tag][0] + old_nb_elements_per_line = old_column.nb_elements_per_line + new_nb_elements_per_line = 0 + old_elements_names = old_column.elements_names + new_elements_names = None + + ##################################################################### + + # Check the length and keys of column lines if needed + if value_type == dict : # Check dictionary keys + for k in value : + if k not in old_elements_names : + new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) + rewrite = True + break + + elif value_type == list or value_type == tuple : # Check vector length + if old_nb_elements_per_line < len(value) : + new_nb_elements_per_line = len(value) + rewrite = True + + ##################################################################### + + if rewrite : + if new_nb_elements_per_line == 0 and new_elements_names is not None : + new_nb_elements_per_line = len(new_elements_names) + + # Reset obierrno + obi_errno = 0 + + dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, + new_data_type=new_type, + new_nb_elements_per_line=new_nb_elements_per_line, + new_elements_names=new_elements_names, + rewrite_last_line=False), + new_type) + + # Update the dictionary: + for t in dcols : + dcols[t] = (view[t], dcols[t][1]) + + # Fill value + dcols[tag][0][i] = value + + except Exception as e: + print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")") + if 'skiperror' in config['obi'] and not config['obi']['skiperror']: + raise e + else: + pass + i+=1 if pb is not None: