import: now imports RDP files and adds taxids for SILVA and RDP files when an NCBI taxonomy is provided, adds import of list values, fixes skipping of errors (failed entries were not being overwritten), and fixes the --no-progress-bar option
@@ -49,7 +49,13 @@ def __addImportInputOption(optionManager):
                        action="store_const", dest="obi:inputformat",
                        default=None,
                        const=b'silva',
-                       help="Input file is in SILVA fasta format")
+                       help="Input file is in SILVA fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
 
+    group.add_argument('--rdp-input',
+                       action="store_const", dest="obi:inputformat",
+                       default=None,
+                       const=b'rdp',
+                       help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
+
     group.add_argument('--embl-input',
                        action="store_const", dest="obi:inputformat",
@@ -34,14 +34,17 @@ from obitools3.dms.capi.obidms cimport obi_import_view
 from obitools3.dms.capi.obitypes cimport obitype_t, \
                                          OBI_VOID, \
                                          OBI_QUAL, \
-                                         OBI_STR
+                                         OBI_STR, \
+                                         OBI_INT
 
 from obitools3.dms.capi.obierrno cimport obi_errno
 
 from obitools3.apps.optiongroups import addImportInputOption, \
                                         addTabularInputOption, \
                                         addTaxdumpInputOption, \
-                                        addMinimalOutputOption
+                                        addMinimalOutputOption, \
+                                        addNoProgressBarOption, \
+                                        addTaxonomyOption
 
 from obitools3.uri.decode import open_uri
 
@@ -50,6 +53,7 @@ from obitools3.apps.config import logger
 from cpython.exc cimport PyErr_CheckSignals
 
 from io import BufferedWriter
+import ast
 
 
 __title__="Import sequences from different formats into a DMS"
@@ -69,7 +73,9 @@ def addOptions(parser):
     addImportInputOption(parser)
     addTabularInputOption(parser)
     addTaxdumpInputOption(parser)
+    addTaxonomyOption(parser)
     addMinimalOutputOption(parser)
+    addNoProgressBarOption(parser)
 
     group = parser.add_argument_group('obi import specific options')
 
@@ -85,6 +91,10 @@ def addOptions(parser):
                        help="If importing a view into another DMS, do it by importing each line, saving disk space if the original view "
                             "has a line selection associated.")
 
+    # group.add_argument('--only-id',
+    #                    action="store", dest="import:onlyid",
+    #                    help="only id")
+
 def run(config):
 
     cdef tuple input
@@ -97,6 +107,7 @@ def run(config):
     cdef bint get_quality
     cdef bint NUC_SEQS_view
     cdef bint silva
+    cdef bint rdp
     cdef int nb_elts
     cdef object d
     cdef View view
@@ -180,6 +191,16 @@ def run(config):
         logger("info", "Done.")
         return
 
+    # Open taxonomy if there is one
+    if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
+        taxo_uri = open_uri(config['obi']['taxoURI'])
+        if taxo_uri is None or taxo_uri[2] == bytes:
+            raise Exception("Couldn't open taxonomy")
+        taxo = taxo_uri[1]
+
+    else :
+        taxo = None
+
     # If importing a view between two DMS and not wanting to save space if line selection in original view, use C API
     if isinstance(input[1], View) and not config['import']['space_priority']:
         if obi_import_view(input[0].name_with_full_path, o_dms.name_with_full_path, input[1].name, tobytes((config['obi']['outputURI'].split('/'))[-1])) < 0 :
@@ -192,8 +213,11 @@ def run(config):
         logger("info", "Done.")
         return
 
-    if entry_count >= 0:
+    # Reinitialize the progress bar
+    if entry_count >= 0 and config['obi']['noprogressbar'] == False:
         pb = ProgressBar(entry_count, config)
+    else:
+        pb = None
 
     NUC_SEQS_view = False
     if isinstance(output[1], View) :
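Editorial note: the --no-progress-bar fix boils down to constructing the progress bar only when the entry count is known and the flag is not set, and treating a None bar as "no display". A minimal sketch of that pattern, with a trivial stand-in ProgressBar class (the real obitools3 class takes the config and has a different signature):

class ProgressBar:
    """Trivial stand-in for the real progress bar class."""
    def __init__(self, total):
        self.total = total
    def __call__(self, done):
        print(f"\r{done}/{self.total} entries", end="")

def make_progress_bar(entry_count, no_progress_bar):
    # Mirrors the condition in the diff: a known entry count and no opt-out flag.
    if entry_count >= 0 and not no_progress_bar:
        return ProgressBar(entry_count)
    return None

pb = make_progress_bar(1000, no_progress_bar=True)
if pb is not None:  # every update is guarded, so --no-progress-bar stays silent
    pb(500)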
@@ -202,20 +226,25 @@ def run(config):
             NUC_SEQS_view = True
     else:
         raise NotImplementedError()
 
     # Save basic columns in variables for optimization
     if NUC_SEQS_view :
         id_col = view[ID_COLUMN]
         def_col = view[DEFINITION_COLUMN]
         seq_col = view[NUC_SEQUENCE_COLUMN]
 
-    # Prepare taxon scientific name if SILVA file
-    if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva":
-        silva = True
-        sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
-    else:
-        silva = False
+    # Prepare taxon scientific name and taxid refs if RDP or SILVA file
+    silva = False
+    rdp = False
+    if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"):
+        #if taxo is None:
+        #    raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
+        silva = True
+        rdp = True
+        if taxo is not None:
+            sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
+            taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
 
     dcols = {}
 
     # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
@@ -265,8 +294,13 @@ def run(config):
     # Reinitialize the input
     if isinstance(input[0], CompressedFile):
         input_is_file = True
-        if entry_count >= 0:
+
+        # Reinitialize the progress bar
+        if entry_count >= 0 and config['obi']['noprogressbar'] == False:
             pb = ProgressBar(entry_count, config)
+        else:
+            pb = None
+
         try:
             input[0].close()
         except AttributeError:
@@ -275,6 +309,11 @@ def run(config):
     if input is None:
         raise Exception("Could not open input URI")
 
+    # if 'onlyid' in config['import']:
+    #     onlyid = tobytes(config['import']['onlyid'])
+    # else:
+    #     onlyid = None
+
     entries = input[1]
     i = 0
     for entry in entries :
@@ -292,6 +331,9 @@ def run(config):
         elif not i%50000:
             logger("info", "Imported %d entries", i)
 
+        # if onlyid is not None and entry.id != onlyid:
+        #     continue
+
         try:
 
             if NUC_SEQS_view:
@@ -307,11 +349,18 @@ def run(config):
                 if get_quality:
                     qual_col[i] = entry.quality
 
-                # Parse taxon scientific name if SILVA file
-                if silva:
-                    sci_name = entry.definition.split(b";")[-1]
-                    sci_name_col[i] = sci_name
+                # Parse taxon scientific name if RDP file
+                if (rdp or silva) and taxo is not None:
+                    sci_names = entry.definition.split(b";")
+                    for sci_name in reversed(sci_names):
+                        if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' :
+                            taxon = taxo.get_taxon_by_name(sci_name)
+                            if taxon is not None:
+                                sci_name_col[i] = taxon.name
+                                taxid_col[i] = taxon.taxid
+                                #print(taxid_col[i], sci_name_col[i])
+                                break
 
             for tag in entry :
 
                 if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
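Editorial note: the taxid assignment added in this hunk walks the semicolon-separated lineage in the SILVA/RDP definition line from the most specific name upwards and keeps the first name the taxonomy recognises. A minimal sketch of that lookup strategy, assuming a plain name-to-taxid dictionary in place of the obitools3 taxonomy object and its get_taxon_by_name() call:

def resolve_taxon(definition: bytes, name_to_taxid: dict):
    """Return (name, taxid) for the most specific recognised name in a ';'-separated lineage."""
    skipped = (b'unidentified', b'uncultured', b'metagenome')
    for sci_name in reversed(definition.split(b";")):
        sci_name = sci_name.strip()
        if not sci_name or sci_name.split()[0] in skipped:
            continue  # same filter as the diff: skip placeholder names
        taxid = name_to_taxid.get(sci_name)
        if taxid is not None:
            return sci_name, taxid
    return None, None

# Toy name->taxid mapping standing in for the NCBI taxonomy:
toy_taxo = {b'Escherichia coli': 562, b'Escherichia': 561}
print(resolve_taxon(b"Bacteria;Proteobacteria;Escherichia coli;uncultured bacterium", toy_taxo))
# -> (b'Escherichia coli', 562)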
@@ -323,27 +372,39 @@ def run(config):
                         tag = COUNT_COLUMN
                     if tag[:7] == b"merged_":
                         tag = MERGED_PREFIX+tag[7:]
 
+                    if type(value) == bytes and value[:1]==b"[" :
+                        try:
+                            if type(eval(value)) == list:
+                                value = eval(value)
+                                #print(value)
+                        except:
+                            pass
+
                    if tag not in dcols :
 
                         value_type = type(value)
                         nb_elts = 1
                         value_obitype = OBI_VOID
                         dict_col = False
 
-                        if value_type == dict or value_type == list :
+                        if value_type == dict :
                             nb_elts = len(value)
                             elt_names = list(value)
-                            if value_type == dict :
-                                dict_col = True
+                            dict_col = True
                         else :
                             nb_elts = 1
                             elt_names = None
 
+                        if value_type == list :
+                            tuples = True
+                        else:
+                            tuples = False
+
                         value_obitype = get_obitype(value)
 
                         if value_obitype != OBI_VOID :
-                            dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col), value_obitype)
+                            dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col, tuples=tuples), value_obitype)
 
                             # Fill value
                             dcols[tag][0][i] = value
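Editorial note: list import relies on recognising serialized lists in text attributes: a bytes value starting with "[" is evaluated, and when the result is a list the column is created with tuples=True. A hedged sketch of the decoding step, using ast.literal_eval instead of the eval() call shown in the diff:

import ast

def decode_possible_list(value):
    """Return a Python list if 'value' looks like a serialized list, else the value unchanged."""
    if isinstance(value, bytes) and value[:1] == b"[":
        try:
            decoded = ast.literal_eval(value.decode("ascii"))
            if isinstance(decoded, list):
                return decoded
        except (ValueError, SyntaxError):
            pass  # not a parsable list, keep the raw bytes
    return value

print(decode_possible_list(b"[1, 2, 3]"))   # -> [1, 2, 3]
print(decode_possible_list(b"not a list"))  # -> b'not a list'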
@@ -419,11 +480,12 @@ def run(config):
                             dcols[tag][0][i] = value
 
         except Exception as e:
-            print("\nCould not import sequence:", entry, "(error raised:", e, ")")
+            print("\nCould not import sequence:", repr(entry), "(error raised:", e, ")")
             if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
                 raise e
             else:
                 pass
+            i-=1 # overwrite problematic entry
 
         i+=1
 
@@ -18,7 +18,7 @@ cdef object clean_empty_values_from_object(object value, exclude=*)
 
 cdef obitype_t get_obitype_single_value(object value)
 cdef obitype_t update_obitype(obitype_t obitype, object new_value)
-cdef obitype_t get_obitype_iterable_value(object value)
+cdef obitype_t get_obitype_iterable_value(object value, type t)
 cdef obitype_t get_obitype(object value)
 
 cdef object __etag__(bytes x, bytes nastring=*)
@@ -260,38 +260,51 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) :
 
     new_type = type(new_value)
 
-    if obitype == OBI_INT :
-        if new_type == float or new_value > OBI_INT_MAX :
-            return OBI_FLOAT
+    #if new_type == NoneType: # doesn't work because Cython sucks
+    if new_value == None or new_type==list or new_type==dict or new_type==tuple:
+        return obitype
 
     # TODO BOOL vers INT/FLOAT
-    elif new_type == str or new_type == bytes :
+    if new_type == str or new_type == bytes :
         if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) :
             pass
        else :
             return OBI_STR
+    elif obitype == OBI_INT :
+        if new_type == float or new_value > OBI_INT_MAX :
+            return OBI_FLOAT
 
     return obitype
 
 
-cdef obitype_t get_obitype_iterable_value(object value) :
+cdef obitype_t get_obitype_iterable_value(object value, type t) :
 
     cdef obitype_t value_obitype
 
     value_obitype = OBI_VOID
 
-    for k in value :
-        if value_obitype == OBI_VOID :
-            value_obitype = get_obitype_single_value(value[k])
-        else :
-            value_obitype = update_obitype(value_obitype, value[k])
+    if t == dict:
+        for k in value :
+            if value_obitype == OBI_VOID :
+                value_obitype = get_obitype_single_value(value[k])
+            else :
+                value_obitype = update_obitype(value_obitype, value[k])
+
+    elif t == list or t == tuple:
+        for v in value :
+            if value_obitype == OBI_VOID :
+                value_obitype = get_obitype_single_value(v)
+            else :
+                value_obitype = update_obitype(value_obitype, v)
 
     return value_obitype
 
 
 cdef obitype_t get_obitype(object value) :
 
-    if type(value) == dict or type(value) == list or type(value) == tuple :
-        return get_obitype_iterable_value(value)
+    t = type(value)
+    if t == dict or t == list or t == tuple :
+        return get_obitype_iterable_value(value, t)
+
     else :
         return get_obitype_single_value(value)
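Editorial note: the reworked type detection dispatches on the container type: dict values are inspected through their keys, lists and tuples through their elements, and update_obitype now leaves the current type untouched for None or nested containers. A rough Python rendering of that promotion logic (illustrative only; the real Cython code returns OBI_* enum codes and also checks OBI_INT_MAX before promoting to float):

def infer_type(value):
    """Infer a single element type for a scalar or container value."""
    def single(v):
        return type(v)
    def update(current, v):
        # None and nested containers keep the already-inferred type.
        if v is None or isinstance(v, (list, dict, tuple)):
            return current
        if isinstance(v, (str, bytes)):
            return str
        if current is int and isinstance(v, float):
            return float  # promote int columns to float when a float appears
        return current
    t = type(value)
    if t in (dict, list, tuple):
        items = value.values() if t is dict else value
        inferred = None
        for v in items:
            inferred = single(v) if inferred is None else update(inferred, v)
        return inferred
    return single(value)

print(infer_type([1, 2.5, None]))        # -> float
print(infer_type({"a": "x", "b": "y"}))  # -> str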