obi import: make skip-on-error handling more robust

2020-05-28 20:40:36 +02:00
parent a7dcf16c06
commit 6094ce2bbc
1 changed file with 122 additions and 113 deletions
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -270,124 +270,133 @@ def run(config):
        elif not i%50000:
            logger("info", "Imported %d entries", i)
-        if NUC_SEQS_view: 
+        try:
-            id_col[i] = entry.id
+             
-            def_col[i] = entry.definition
+            if NUC_SEQS_view: 
-            seq_col[i] = entry.seq
+                id_col[i] = entry.id
-            # Check if there is a sequencing quality associated by checking the first entry    # TODO haven't found a more robust solution yet
+                def_col[i] = entry.definition
-            if i == 0:
+                seq_col[i] = entry.seq
-                get_quality = QUALITY_COLUMN in entry
+                # Check if there is a sequencing quality associated by checking the first entry    # TODO haven't found a more robust solution yet
                if i == 0:
                    get_quality = QUALITY_COLUMN in entry
                    if get_quality:
                        Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
                        qual_col = view[QUALITY_COLUMN]
                if get_quality:
-                    Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
+                    qual_col[i] = entry.quality
                    qual_col = view[QUALITY_COLUMN]
            if get_quality:
                qual_col[i] = entry.quality
-        for tag in entry :
+            for tag in entry :
-            if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN :  # TODO dirty 
+                if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN :  # TODO dirty 
-                value = entry[tag]
+                    value = entry[tag]
-                if tag == b"taxid":
+                    if tag == b"taxid":
-                    tag = TAXID_COLUMN
+                        tag = TAXID_COLUMN
-                if tag == b"count":
+                    if tag == b"count":
-                    tag = COUNT_COLUMN
+                        tag = COUNT_COLUMN
-                if tag[:7] == b"merged_":
+                    if tag[:7] == b"merged_":
-                    tag = MERGED_PREFIX+tag[7:]
+                        tag = MERGED_PREFIX+tag[7:]
-                if tag not in dcols :
+                    if tag not in dcols :
                    value_type = type(value)
                    nb_elts = 1
                    value_obitype = OBI_VOID
                    if value_type == dict or value_type == list :
                        nb_elts = len(value)
                        elt_names = list(value)
                    else :
                        nb_elts = 1
                        elt_names = None
                    value_obitype = get_obitype(value)
                    if value_obitype != OBI_VOID :
                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
                        # Fill value
                        if value_type == dict and nb_elts == 1:  # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
                            value = value[list(value.keys())[0]]       # The solution is to transform the value in a simple atomic one acceptable by the column
                        dcols[tag][0][i] = value
                    # TODO else log error?
                else :
                    rewrite = False
                    # Check type adequation
                    old_type = dcols[tag][1]
                    new_type = OBI_VOID
                    new_type = update_obitype(old_type, value)
                    if old_type != new_type :
                        rewrite = True
                    try:
                        # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key                        
                        if type(value) == dict and \
                            dcols[tag][0].nb_elements_per_line == 1 \
                            and set(dcols[tag][0].elements_names) != set(value.keys()) :
                            raise IndexError  # trigger column rewrite
                        # Fill value
                        dcols[tag][0][i] = value
                    except IndexError :
                        value_type = type(value)
-                        old_column = dcols[tag][0]
+                        nb_elts = 1
-                        old_nb_elements_per_line = old_column.nb_elements_per_line
+                        value_obitype = OBI_VOID
                        new_nb_elements_per_line = 0
                        old_elements_names = old_column.elements_names
                        new_elements_names = None
-                        #####################################################################
+                        if value_type == dict or value_type == list :
                            nb_elts = len(value)
                            elt_names = list(value)
                        else :
                            nb_elts = 1
                            elt_names = None
-                        # Check the length and keys of column lines if needed
+                        value_obitype = get_obitype(value)
                        if value_type == dict :    # Check dictionary keys
                            for k in value :
                                if k not in old_elements_names :
                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
                                    rewrite = True
                                    break
-                        elif value_type == list or value_type == tuple :  # Check vector length
+                        if value_obitype != OBI_VOID :
-                            if old_nb_elements_per_line < len(value) :
+                            dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
                                new_nb_elements_per_line = len(value)
                                rewrite = True
-                        #####################################################################
+                            # Fill value
                            if value_type == dict and nb_elts == 1:  # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
                                value = value[list(value.keys())[0]]       # The solution is to transform the value in a simple atomic one acceptable by the column
                            dcols[tag][0][i] = value
-                        if rewrite :
+                        # TODO else log error?
                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
                                new_nb_elements_per_line = len(new_elements_names)
-                            # Reset obierrno 
+                    else :
                            obi_errno = 0
-                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
+                        rewrite = False
                                                                                   new_data_type=new_type, 
                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                   new_elements_names=new_elements_names,
                                                                                   rewrite_last_line=False), 
                                          new_type)
-                            # Update the dictionary:
+                        # Check type adequation
-                            for t in dcols :
+                        old_type = dcols[tag][1]
-                                dcols[t] = (view[t], dcols[t][1])
+                        new_type = OBI_VOID
                        new_type = update_obitype(old_type, value)
                        if old_type != new_type :
                            rewrite = True
                        try:
                            # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key                        
                            if type(value) == dict and \
                                dcols[tag][0].nb_elements_per_line == 1 \
                                and set(dcols[tag][0].elements_names) != set(value.keys()) :
                                raise IndexError  # trigger column rewrite
                            # Fill value
                            dcols[tag][0][i] = value
                        except IndexError :
                            value_type = type(value)
                            old_column = dcols[tag][0]
                            old_nb_elements_per_line = old_column.nb_elements_per_line
                            new_nb_elements_per_line = 0
                            old_elements_names = old_column.elements_names
                            new_elements_names = None
                            #####################################################################
                            # Check the length and keys of column lines if needed
                            if value_type == dict :    # Check dictionary keys
                                for k in value :
                                    if k not in old_elements_names :
                                        new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
                                        rewrite = True
                                        break
                            elif value_type == list or value_type == tuple :  # Check vector length
                                if old_nb_elements_per_line < len(value) :
                                    new_nb_elements_per_line = len(value)
                                    rewrite = True
                            #####################################################################
                            if rewrite :
                                if new_nb_elements_per_line == 0 and new_elements_names is not None :
                                    new_nb_elements_per_line = len(new_elements_names)
                                # Reset obierrno 
                                obi_errno = 0
                                dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
                                                                                       new_data_type=new_type, 
                                                                                       new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                       new_elements_names=new_elements_names,
                                                                                       rewrite_last_line=False), 
                                              new_type)
                                # Update the dictionary:
                                for t in dcols :
                                    dcols[t] = (view[t], dcols[t][1])
                                # Fill value
                                dcols[tag][0][i] = value
        except Exception as e:
            print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
            if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
                raise e
            else:
                pass
        i+=1
    if pb is not None: