diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py index ba2c324..a907466 100755 --- a/python/obitools3/apps/optiongroups/__init__.py +++ b/python/obitools3/apps/optiongroups/__init__.py @@ -49,7 +49,13 @@ def __addImportInputOption(optionManager): action="store_const", dest="obi:inputformat", default=None, const=b'silva', - help="Input file is in SILVA fasta format") + help="Input file is in SILVA fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.") + + group.add_argument('--rdp-input', + action="store_const", dest="obi:inputformat", + default=None, + const=b'rdp', + help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.") group.add_argument('--embl-input', action="store_const", dest="obi:inputformat", diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index 56b1cce..3f3fc69 100755 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -34,14 +34,17 @@ from obitools3.dms.capi.obidms cimport obi_import_view from obitools3.dms.capi.obitypes cimport obitype_t, \ OBI_VOID, \ OBI_QUAL, \ - OBI_STR + OBI_STR, \ + OBI_INT from obitools3.dms.capi.obierrno cimport obi_errno from obitools3.apps.optiongroups import addImportInputOption, \ addTabularInputOption, \ addTaxdumpInputOption, \ - addMinimalOutputOption + addMinimalOutputOption, \ + addNoProgressBarOption, \ + addTaxonomyOption from obitools3.uri.decode import open_uri @@ -50,6 +53,7 @@ from obitools3.apps.config import logger from cpython.exc cimport PyErr_CheckSignals from io import BufferedWriter +import ast __title__="Import sequences from different formats into a DMS" @@ -69,7 +73,9 @@ def addOptions(parser): addImportInputOption(parser) addTabularInputOption(parser) addTaxdumpInputOption(parser) + addTaxonomyOption(parser) addMinimalOutputOption(parser) + addNoProgressBarOption(parser) group = parser.add_argument_group('obi import specific options') @@ -85,6 +91,10 @@ def addOptions(parser): help="If importing a view into another DMS, do it by importing each line, saving disk space if the original view " "has a line selection associated.") +# group.add_argument('--only-id', +# action="store", dest="import:onlyid", +# help="only id") + def run(config): cdef tuple input @@ -97,6 +107,7 @@ def run(config): cdef bint get_quality cdef bint NUC_SEQS_view cdef bint silva + cdef bint rdp cdef int nb_elts cdef object d cdef View view @@ -180,6 +191,16 @@ def run(config): logger("info", "Done.") return + # Open taxonomy if there is one + if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None: + taxo_uri = open_uri(config['obi']['taxoURI']) + if taxo_uri is None or taxo_uri[2] == bytes: + raise Exception("Couldn't open taxonomy") + taxo = taxo_uri[1] + + else : + taxo = None + # If importing a view between two DMS and not wanting to save space if line selection in original view, use C API if isinstance(input[1], View) and not config['import']['space_priority']: if obi_import_view(input[0].name_with_full_path, o_dms.name_with_full_path, input[1].name, tobytes((config['obi']['outputURI'].split('/'))[-1])) < 0 : @@ -192,8 +213,11 @@ def run(config): logger("info", "Done.") return - if entry_count >= 0: + # Reinitialize the progress bar + if entry_count >= 0 and config['obi']['noprogressbar'] == False: pb = ProgressBar(entry_count, config) + else: + pb = None NUC_SEQS_view = False if isinstance(output[1], View) : @@ -202,20 +226,25 @@ def run(config): NUC_SEQS_view = True else: raise NotImplementedError() - + # Save basic columns in variables for optimization if NUC_SEQS_view : id_col = view[ID_COLUMN] def_col = view[DEFINITION_COLUMN] seq_col = view[NUC_SEQUENCE_COLUMN] - # Prepare taxon scientific name if SILVA file - if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva": + # Prepare taxon scientific name and taxid refs if RDP or SILVA file + silva = False + rdp = False + if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"): + #if taxo is None: + # raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files") silva = True - sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR) - else: - silva = False - + rdp = True + if taxo is not None: + sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR) + taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT) + dcols = {} # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite @@ -265,8 +294,13 @@ def run(config): # Reinitialize the input if isinstance(input[0], CompressedFile): input_is_file = True - if entry_count >= 0: + + # Reinitialize the progress bar + if entry_count >= 0 and config['obi']['noprogressbar'] == False: pb = ProgressBar(entry_count, config) + else: + pb = None + try: input[0].close() except AttributeError: @@ -275,6 +309,11 @@ def run(config): if input is None: raise Exception("Could not open input URI") +# if 'onlyid' in config['import']: +# onlyid = tobytes(config['import']['onlyid']) +# else: +# onlyid = None + entries = input[1] i = 0 for entry in entries : @@ -292,6 +331,9 @@ def run(config): elif not i%50000: logger("info", "Imported %d entries", i) +# if onlyid is not None and entry.id != onlyid: +# continue + try: if NUC_SEQS_view: @@ -307,11 +349,18 @@ def run(config): if get_quality: qual_col[i] = entry.quality - # Parse taxon scientific name if SILVA file - if silva: - sci_name = entry.definition.split(b";")[-1] - sci_name_col[i] = sci_name - + # Parse taxon scientific name if RDP file + if (rdp or silva) and taxo is not None: + sci_names = entry.definition.split(b";") + for sci_name in reversed(sci_names): + if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' : + taxon = taxo.get_taxon_by_name(sci_name) + if taxon is not None: + sci_name_col[i] = taxon.name + taxid_col[i] = taxon.taxid + #print(taxid_col[i], sci_name_col[i]) + break + for tag in entry : if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty @@ -323,27 +372,39 @@ def run(config): tag = COUNT_COLUMN if tag[:7] == b"merged_": tag = MERGED_PREFIX+tag[7:] - + + if type(value) == bytes and value[:1]==b"[" : + try: + if type(eval(value)) == list: + value = eval(value) + #print(value) + except: + pass + if tag not in dcols : - + value_type = type(value) nb_elts = 1 value_obitype = OBI_VOID dict_col = False - - if value_type == dict or value_type == list : + + if value_type == dict : nb_elts = len(value) elt_names = list(value) - if value_type == dict : - dict_col = True + dict_col = True else : nb_elts = 1 elt_names = None - + + if value_type == list : + tuples = True + else: + tuples = False + value_obitype = get_obitype(value) - + if value_obitype != OBI_VOID : - dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col), value_obitype) + dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col, tuples=tuples), value_obitype) # Fill value dcols[tag][0][i] = value @@ -419,11 +480,12 @@ def run(config): dcols[tag][0][i] = value except Exception as e: - print("\nCould not import sequence:", entry, "(error raised:", e, ")") + print("\nCould not import sequence:", repr(entry), "(error raised:", e, ")") if 'skiperror' in config['obi'] and not config['obi']['skiperror']: raise e else: pass + i-=1 # overwrite problematic entry i+=1 diff --git a/python/obitools3/utils.pxd b/python/obitools3/utils.pxd index 5c4672e..61f6142 100755 --- a/python/obitools3/utils.pxd +++ b/python/obitools3/utils.pxd @@ -18,7 +18,7 @@ cdef object clean_empty_values_from_object(object value, exclude=*) cdef obitype_t get_obitype_single_value(object value) cdef obitype_t update_obitype(obitype_t obitype, object new_value) -cdef obitype_t get_obitype_iterable_value(object value) +cdef obitype_t get_obitype_iterable_value(object value, type t) cdef obitype_t get_obitype(object value) cdef object __etag__(bytes x, bytes nastring=*) diff --git a/python/obitools3/utils.pyx b/python/obitools3/utils.pyx index e2e3e18..f6f02a8 100755 --- a/python/obitools3/utils.pyx +++ b/python/obitools3/utils.pyx @@ -260,38 +260,51 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) : new_type = type(new_value) - if obitype == OBI_INT : - if new_type == float or new_value > OBI_INT_MAX : - return OBI_FLOAT + #if new_type == NoneType: # doesn't work because Cython sucks + if new_value == None or new_type==list or new_type==dict or new_type==tuple: + return obitype + # TODO BOOL vers INT/FLOAT - elif new_type == str or new_type == bytes : + if new_type == str or new_type == bytes : if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) : pass else : return OBI_STR - + elif obitype == OBI_INT : + if new_type == float or new_value > OBI_INT_MAX : + return OBI_FLOAT + return obitype -cdef obitype_t get_obitype_iterable_value(object value) : +cdef obitype_t get_obitype_iterable_value(object value, type t) : cdef obitype_t value_obitype value_obitype = OBI_VOID - for k in value : - if value_obitype == OBI_VOID : - value_obitype = get_obitype_single_value(value[k]) - else : - value_obitype = update_obitype(value_obitype, value[k]) + if t == dict: + for k in value : + if value_obitype == OBI_VOID : + value_obitype = get_obitype_single_value(value[k]) + else : + value_obitype = update_obitype(value_obitype, value[k]) + + elif t == list or t == tuple: + for v in value : + if value_obitype == OBI_VOID : + value_obitype = get_obitype_single_value(v) + else : + value_obitype = update_obitype(value_obitype, v) return value_obitype cdef obitype_t get_obitype(object value) : - if type(value) == dict or type(value) == list or type(value) == tuple : - return get_obitype_iterable_value(value) + t = type(value) + if t == dict or t == list or t == tuple : + return get_obitype_iterable_value(value, t) else : return get_obitype_single_value(value)