From b9b4cec5b5378afbf5b4cb67bfd509d428bed668 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 30 Oct 2020 10:43:04 +0100 Subject: [PATCH] import: now can import SILVA fasta files --- .../obitools3/apps/optiongroups/__init__.py | 6 +++++ python/obitools3/commands/import.pyx | 27 +++++++++++++++---- python/obitools3/uri/decode.pyx | 2 +- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py index 013ca8c..63cdfbb 100755 --- a/python/obitools3/apps/optiongroups/__init__.py +++ b/python/obitools3/apps/optiongroups/__init__.py @@ -39,6 +39,12 @@ def __addImportInputOption(optionManager): const=b'fastq', help="Input file is in fastq format") + group.add_argument('--silva-input', + action="store_const", dest="obi:inputformat", + default=None, + const=b'silva', + help="Input file is in SILVA fasta format") + group.add_argument('--embl-input', action="store_const", dest="obi:inputformat", default=None, diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index bb0a36f..ef64a59 100755 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -26,13 +26,15 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \ QUALITY_COLUMN, \ COUNT_COLUMN, \ TAXID_COLUMN, \ - MERGED_PREFIX + MERGED_PREFIX, \ + SCIENTIFIC_NAME_COLUMN from obitools3.dms.capi.obidms cimport obi_import_view from obitools3.dms.capi.obitypes cimport obitype_t, \ OBI_VOID, \ - OBI_QUAL + OBI_QUAL, \ + OBI_STR from obitools3.dms.capi.obierrno cimport obi_errno @@ -94,6 +96,7 @@ def run(config): cdef obitype_t new_type cdef bint get_quality cdef bint NUC_SEQS_view + cdef bint silva cdef int nb_elts cdef object d cdef View view @@ -104,6 +107,8 @@ def run(config): cdef Column seq_col cdef Column qual_col cdef Column old_column + cdef Column sci_name_col + cdef bytes sci_name cdef bint rewrite cdef dict dcols cdef int skipping @@ -203,9 +208,16 @@ def run(config): id_col = view[ID_COLUMN] def_col = view[DEFINITION_COLUMN] seq_col = view[NUC_SEQUENCE_COLUMN] - + + # Prepare taxon scientific name if SILVA file + if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva": + silva = True + sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR) + else: + silva = False + dcols = {} - + # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite if config['import']['preread']: logger("info", "First readthrough...") @@ -282,7 +294,7 @@ def run(config): try: if NUC_SEQS_view: - id_col[i] = entry.id + id_col[i] = entry.id def_col[i] = entry.definition seq_col[i] = entry.seq # Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet @@ -293,6 +305,11 @@ def run(config): qual_col = view[QUALITY_COLUMN] if get_quality: qual_col[i] = entry.quality + + # Parse taxon scientific name if SILVA file + if silva: + sci_name = entry.definition.split(b";")[-1] + sci_name_col[i] = sci_name for tag in entry : diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx index 7690bb8..3cf95c6 100644 --- a/python/obitools3/uri/decode.pyx +++ b/python/obitools3/uri/decode.pyx @@ -464,7 +464,7 @@ def open_uri(uri, if format is not None: if seqtype==b"nuc": objclass = Nuc_Seq # Nuc_Seq_Stored? TODO - if format==b"fasta": + if format==b"fasta" or format==b"silva": if input: iseq = fastaNucIterator(file, skip=skip,