From ef9d9674b00f0354581d133ca531c2c3e70cdf93 Mon Sep 17 00:00:00 2001
From: mercierc
Date: Tue, 17 May 2022 09:36:33 +1200
Subject: [PATCH] obi import: added SINTAX format import and switch to version 3.0.1b18

---
 .../obitools3/apps/optiongroups/__init__.py   |  6 +++++
 python/obitools3/commands/import.pyx          | 28 ++++++++++++++++---
 python/obitools3/uri/decode.pyx               |  2 +-
 python/obitools3/version.py                   |  2 +-
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py
index ac626d8..47b33db 100755
--- a/python/obitools3/apps/optiongroups/__init__.py
+++ b/python/obitools3/apps/optiongroups/__init__.py
@@ -63,6 +63,12 @@ def __addImportInputOption(optionManager):
                        const=b'unite',
                        help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
 
+    group.add_argument('--sintax-input',
+                       action="store_const", dest="obi:inputformat",
+                       default=None,
+                       const=b'sintax',
+                       help="Input file is in SINTAX fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
+
     group.add_argument('--embl-input',
                        action="store_const", dest="obi:inputformat",
                        default=None,
diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx
index fde0237..89cd386 100755
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@@ -109,6 +109,8 @@ def run(config):
     cdef bint NUC_SEQS_view
     cdef bint silva
     cdef bint rdp
+    cdef bint unite
+    cdef bint sintax
     cdef int nb_elts
     cdef object d
     cdef View view
@@ -234,11 +236,15 @@ def run(config):
         def_col = view[DEFINITION_COLUMN]
         seq_col = view[NUC_SEQUENCE_COLUMN]
 
-    # Prepare taxon scientific name and taxid refs if RDP or SILVA file
+    # Prepare taxon scientific name and taxid refs if RDP/SILVA/UNITE/SINTAX formats
     silva = False
     rdp = False
     unite = False
-    if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp" or config['obi']['inputformat'] == b"unite"):
+    sintax=False
+    if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or \
+                                           config['obi']['inputformat'] == b"rdp" or \
+                                           config['obi']['inputformat'] == b"unite" or \
+                                           config['obi']['inputformat'] == b"sintax"):
         #if taxo is None:
         #    raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
         if config['obi']['inputformat'] == b"silva":
@@ -247,6 +253,8 @@ def run(config):
             rdp = True
         elif config['obi']['inputformat'] == b"unite":
             unite = True
+        elif config['obi']['inputformat'] == b"sintax":
+            sintax = True
         sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
         if taxo is not None:
             taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
@@ -356,13 +364,27 @@ def run(config):
                             qual_col[i] = entry.quality
 
                     # Parse taxon scientific name if RDP or Silva or Unite file
-                    if (rdp or silva or unite):
+                    if (rdp or silva or unite or sintax):
                         if rdp or silva:
                             sci_names = entry.definition.split(b";")
                             sci_name_col[i] = sci_names[-1]
                         elif unite:
                             sci_names = entry.id.split(b'|')[-1].split(b';')
                             sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
+                        elif sintax:
+                            # The 2nd ';'-separated field is parsed as key=rank:name,rank:name,... (last char of the definition is dropped)
+                            reconstructed_line = entry.id+b' '+entry.definition[:-1]
+                            splitted_reconstructed_line = reconstructed_line.split(b';')
+                            taxa = splitted_reconstructed_line[1].split(b'=', 1)[1]
+                            taxa = taxa.split(b',')
+                            sci_names = []
+                            for t in taxa:
+                                tf = t.split(b':')[1]
+                                sci_names.append(tf)
+                            sci_name_col[i] = sci_names[-1]
+                            id_col[i] = reconstructed_line.split(b';')[0]
+                            def_col[i] = reconstructed_line
+
                         # Fond taxid if taxonomy provided
                         if taxo is not None :
                             for sci_name in reversed(sci_names):
diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx
index 86c5970..373054b 100644
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@@ -506,7 +506,7 @@ def open_uri(uri,
     if format is not None:
         if seqtype==b"nuc":
             objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
-            if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite":
+            if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite" or format == b"sintax":
                 if input:
                     iseq = fastaNucIterator(file,
                                             skip=skip,
diff --git a/python/obitools3/version.py b/python/obitools3/version.py
index ddc8c61..d64bb53 100755
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '1b17'
+serial= '1b18'
 
 version ="%d.%d.%s" % (major,minor,serial)