import: added RDP input format and taxid/scientific name annotation for SILVA and RDP files (when an NCBI taxonomy is provided), added import of list values, fixed error skipping (failed entries were not being overwritten), and fixed the --no-progress-bar option
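For context, a rough usage sketch of what this change enables (the `--rdp-input` and `--taxonomy` options appear in the diff below; the `--taxdump` option name, file names and DMS paths are assumptions for illustration only):

```
# Build the NCBI taxonomy inside the DMS first (taxdump option name assumed)
obi import --taxdump taxdump.tar.gz my_dms/taxonomy/ncbi

# Import an RDP training set; with --taxonomy, a taxid and scientific name
# are looked up and stored for each sequence
obi import --rdp-input --taxonomy my_dms/taxonomy/ncbi rdp_trainset.fasta my_dms/rdp_refs
```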
@@ -49,7 +49,13 @@ def __addImportInputOption(optionManager):
                        action="store_const", dest="obi:inputformat",
                        default=None,
                        const=b'silva',
-                       help="Input file is in SILVA fasta format")
+                       help="Input file is in SILVA fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
+
+    group.add_argument('--rdp-input',
+                       action="store_const", dest="obi:inputformat",
+                       default=None,
+                       const=b'rdp',
+                       help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
 
     group.add_argument('--embl-input',
                        action="store_const", dest="obi:inputformat",
@@ -34,14 +34,17 @@ from obitools3.dms.capi.obidms cimport obi_import_view
 from obitools3.dms.capi.obitypes cimport obitype_t, \
                                          OBI_VOID, \
                                          OBI_QUAL, \
-                                         OBI_STR
+                                         OBI_STR, \
+                                         OBI_INT
 
 from obitools3.dms.capi.obierrno cimport obi_errno
 
 from obitools3.apps.optiongroups import addImportInputOption, \
                                         addTabularInputOption, \
                                         addTaxdumpInputOption, \
-                                        addMinimalOutputOption
+                                        addMinimalOutputOption, \
+                                        addNoProgressBarOption, \
+                                        addTaxonomyOption
 
 from obitools3.uri.decode import open_uri
 
@@ -50,6 +53,7 @@ from obitools3.apps.config import logger
 from cpython.exc cimport PyErr_CheckSignals
 
 from io import BufferedWriter
+import ast
 
 
 __title__="Import sequences from different formats into a DMS"
@@ -69,7 +73,9 @@ def addOptions(parser):
     addImportInputOption(parser)
     addTabularInputOption(parser)
     addTaxdumpInputOption(parser)
+    addTaxonomyOption(parser)
     addMinimalOutputOption(parser)
+    addNoProgressBarOption(parser)
 
     group = parser.add_argument_group('obi import specific options')
 
@@ -85,6 +91,10 @@ def addOptions(parser):
                        help="If importing a view into another DMS, do it by importing each line, saving disk space if the original view "
                             "has a line selection associated.")
 
+    # group.add_argument('--only-id',
+    #                    action="store", dest="import:onlyid",
+    #                    help="only id")
+
 def run(config):
 
     cdef tuple input
@@ -97,6 +107,7 @@ def run(config):
     cdef bint get_quality
     cdef bint NUC_SEQS_view
     cdef bint silva
+    cdef bint rdp
     cdef int nb_elts
     cdef object d
     cdef View view
@@ -180,6 +191,16 @@ def run(config):
             logger("info", "Done.")
             return
 
+    # Open taxonomy if there is one
+    if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
+        taxo_uri = open_uri(config['obi']['taxoURI'])
+        if taxo_uri is None or taxo_uri[2] == bytes:
+            raise Exception("Couldn't open taxonomy")
+        taxo = taxo_uri[1]
+
+    else :
+        taxo = None
+
     # If importing a view between two DMS and not wanting to save space if line selection in original view, use C API
     if isinstance(input[1], View) and not config['import']['space_priority']:
         if obi_import_view(input[0].name_with_full_path, o_dms.name_with_full_path, input[1].name, tobytes((config['obi']['outputURI'].split('/'))[-1])) < 0 :
@@ -192,8 +213,11 @@ def run(config):
         logger("info", "Done.")
         return
 
-    if entry_count >= 0:
+    # Reinitialize the progress bar
+    if entry_count >= 0 and config['obi']['noprogressbar'] == False:
         pb = ProgressBar(entry_count, config)
+    else:
+        pb = None
 
     NUC_SEQS_view = False
     if isinstance(output[1], View) :
@@ -202,20 +226,25 @@ def run(config):
             NUC_SEQS_view = True
         else:
             raise NotImplementedError()
 
 
     # Save basic columns in variables for optimization
     if NUC_SEQS_view :
         id_col = view[ID_COLUMN]
         def_col = view[DEFINITION_COLUMN]
         seq_col = view[NUC_SEQUENCE_COLUMN]
 
-    # Prepare taxon scientific name if SILVA file
-    if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva":
-        silva = True
-        sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
-    else:
-        silva = False
-
+    # Prepare taxon scientific name and taxid refs if RDP or SILVA file
+    silva = False
+    rdp = False
+    if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"):
+        #if taxo is None:
+        #    raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
+        rdp = True
+        if taxo is not None:
+            sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
+            taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
+
     dcols = {}
 
     # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
@@ -265,8 +294,13 @@ def run(config):
     # Reinitialize the input
     if isinstance(input[0], CompressedFile):
         input_is_file = True
-        if entry_count >= 0:
+
+        # Reinitialize the progress bar
+        if entry_count >= 0 and config['obi']['noprogressbar'] == False:
             pb = ProgressBar(entry_count, config)
+        else:
+            pb = None
+
         try:
             input[0].close()
         except AttributeError:
@@ -275,6 +309,11 @@ def run(config):
     if input is None:
         raise Exception("Could not open input URI")
 
+    # if 'onlyid' in config['import']:
+    #     onlyid = tobytes(config['import']['onlyid'])
+    # else:
+    #     onlyid = None
+
     entries = input[1]
     i = 0
     for entry in entries :
@@ -292,6 +331,9 @@ def run(config):
         elif not i%50000:
             logger("info", "Imported %d entries", i)
 
+        # if onlyid is not None and entry.id != onlyid:
+        #     continue
+
         try:
 
             if NUC_SEQS_view:
@@ -307,11 +349,18 @@ def run(config):
                 if get_quality:
                     qual_col[i] = entry.quality
 
-                # Parse taxon scientific name if SILVA file
-                if silva:
-                    sci_name = entry.definition.split(b";")[-1]
-                    sci_name_col[i] = sci_name
-
+                # Parse taxon scientific name if RDP file
+                if (rdp or silva) and taxo is not None:
+                    sci_names = entry.definition.split(b";")
+                    for sci_name in reversed(sci_names):
+                        if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' :
+                            taxon = taxo.get_taxon_by_name(sci_name)
+                            if taxon is not None:
+                                sci_name_col[i] = taxon.name
+                                taxid_col[i] = taxon.taxid
+                                #print(taxid_col[i], sci_name_col[i])
+                                break
+
             for tag in entry :
 
                 if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
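The annotation logic added above walks the semicolon-separated lineage in the definition line from the most specific name upward, skipping placeholder names, until one resolves in the taxonomy. A standalone sketch of that lookup order (the `resolve` helper and the example definition are illustrative only; `get_taxon_by_name`, `taxid` and `name` are the taxonomy accessors used in the diff):

```python
def resolve(definition, taxonomy):
    # definition is e.g. b"Bacteria;Proteobacteria;...;uncultured bacterium"
    for name in reversed(definition.split(b";")):
        words = name.split()
        # skip empty segments and placeholder names that cannot resolve usefully
        if not words or words[0] in (b'unidentified', b'uncultured', b'metagenome'):
            continue
        taxon = taxonomy.get_taxon_by_name(name)
        if taxon is not None:
            return taxon.taxid, taxon.name
    return None, None
```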
@@ -323,27 +372,39 @@ def run(config):
                         tag = COUNT_COLUMN
                     if tag[:7] == b"merged_":
                         tag = MERGED_PREFIX+tag[7:]
 
+                    if type(value) == bytes and value[:1]==b"[" :
+                        try:
+                            if type(eval(value)) == list:
+                                value = eval(value)
+                                #print(value)
+                        except:
+                            pass
+
                     if tag not in dcols :
 
                         value_type = type(value)
                         nb_elts = 1
                         value_obitype = OBI_VOID
                         dict_col = False
 
-                        if value_type == dict or value_type == list :
+                        if value_type == dict :
                             nb_elts = len(value)
                             elt_names = list(value)
-                            if value_type == dict :
-                                dict_col = True
+                            dict_col = True
                         else :
                             nb_elts = 1
                             elt_names = None
 
+                        if value_type == list :
+                            tuples = True
+                        else:
+                            tuples = False
+
                         value_obitype = get_obitype(value)
 
                         if value_obitype != OBI_VOID :
-                            dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col), value_obitype)
+                            dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col, tuples=tuples), value_obitype)
 
                             # Fill value
                             dcols[tag][0][i] = value
@@ -419,11 +480,12 @@ def run(config):
                                 dcols[tag][0][i] = value
 
         except Exception as e:
-            print("\nCould not import sequence:", entry, "(error raised:", e, ")")
+            print("\nCould not import sequence:", repr(entry), "(error raised:", e, ")")
             if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
                 raise e
             else:
                 pass
+            i-=1 # overwrite problematic entry
 
         i+=1
 
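The `i-=1` added above is what makes error skipping actually overwrite the partially written line instead of leaving it in the view: the line counter only advances past lines that hold a successfully imported entry. A simplified sketch of that index handling, with a hypothetical `write_entry` standing in for the per-column writes and `skip_errors` mirroring `config['obi']['skiperror']`:

```python
i = 0
for entry in entries:
    try:
        write_entry(view, i, entry)   # hypothetical stand-in for the column assignments
    except Exception:
        if not skip_errors:
            raise
        i -= 1                        # step back so the next entry overwrites line i
    i += 1                            # advance only when line i holds a good entry
```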
@@ -18,7 +18,7 @@ cdef object clean_empty_values_from_object(object value, exclude=*)
 
 cdef obitype_t get_obitype_single_value(object value)
 cdef obitype_t update_obitype(obitype_t obitype, object new_value)
-cdef obitype_t get_obitype_iterable_value(object value)
+cdef obitype_t get_obitype_iterable_value(object value, type t)
 cdef obitype_t get_obitype(object value)
 
 cdef object __etag__(bytes x, bytes nastring=*)
@@ -260,38 +260,51 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) :
 
     new_type = type(new_value)
 
-    if obitype == OBI_INT :
-        if new_type == float or new_value > OBI_INT_MAX :
-            return OBI_FLOAT
+    #if new_type == NoneType: # doesn't work because Cython sucks
+    if new_value == None or new_type==list or new_type==dict or new_type==tuple:
+        return obitype
 
     # TODO BOOL vers INT/FLOAT
-    elif new_type == str or new_type == bytes :
+    if new_type == str or new_type == bytes :
         if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) :
             pass
         else :
             return OBI_STR
 
+    elif obitype == OBI_INT :
+        if new_type == float or new_value > OBI_INT_MAX :
+            return OBI_FLOAT
+
     return obitype
 
 
-cdef obitype_t get_obitype_iterable_value(object value) :
+cdef obitype_t get_obitype_iterable_value(object value, type t) :
 
     cdef obitype_t value_obitype
 
     value_obitype = OBI_VOID
 
-    for k in value :
-        if value_obitype == OBI_VOID :
-            value_obitype = get_obitype_single_value(value[k])
-        else :
-            value_obitype = update_obitype(value_obitype, value[k])
+    if t == dict:
+        for k in value :
+            if value_obitype == OBI_VOID :
+                value_obitype = get_obitype_single_value(value[k])
+            else :
+                value_obitype = update_obitype(value_obitype, value[k])
+
+    elif t == list or t == tuple:
+        for v in value :
+            if value_obitype == OBI_VOID :
+                value_obitype = get_obitype_single_value(v)
+            else :
+                value_obitype = update_obitype(value_obitype, v)
 
     return value_obitype
 
 
 cdef obitype_t get_obitype(object value) :
 
-    if type(value) == dict or type(value) == list or type(value) == tuple :
-        return get_obitype_iterable_value(value)
+    t = type(value)
+    if t == dict or t == list or t == tuple :
+        return get_obitype_iterable_value(value, t)
 
     else :
         return get_obitype_single_value(value)
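A pure-Python sketch of the dispatch these changes introduce: dict values are typed from their stored values, list and tuple values from their items, and everything else goes through the single-value path (`get_obitype_single_value`, `update_obitype` and `OBI_VOID` are the helpers visible in the diff; the sketch only restates the control flow):

```python
def get_obitype_sketch(value):
    t = type(value)
    if t is dict:
        items = list(value.values())   # same effect as iterating keys and indexing
    elif t is list or t is tuple:
        items = list(value)
    else:
        return get_obitype_single_value(value)
    obitype = OBI_VOID
    for item in items:
        if obitype == OBI_VOID:
            obitype = get_obitype_single_value(item)
        else:
            obitype = update_obitype(obitype, item)
    return obitype
```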