import: can now import SILVA fasta files

2020-10-30 10:43:04 +01:00
parent 199f3772e8
commit b9b4cec5b5
3 changed files with 29 additions and 6 deletions
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@ -39,6 +39,12 @@ def __addImportInputOption(optionManager):
                     const=b'fastq',
                     help="Input file is in fastq format")
    group.add_argument('--silva-input',
                     action="store_const", dest="obi:inputformat",
                     default=None,
                     const=b'silva',
                     help="Input file is in SILVA fasta format")
    group.add_argument('--embl-input',
                     action="store_const", dest="obi:inputformat",
                     default=None,
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -26,13 +26,15 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \
                                        QUALITY_COLUMN, \
                                        COUNT_COLUMN, \
                                        TAXID_COLUMN, \
-                                        MERGED_PREFIX
+                                        MERGED_PREFIX, \
                                        SCIENTIFIC_NAME_COLUMN
 from obitools3.dms.capi.obidms cimport obi_import_view
 from obitools3.dms.capi.obitypes cimport obitype_t, \
                                         OBI_VOID, \
-                                         OBI_QUAL
+                                         OBI_QUAL, \
                                         OBI_STR
 from obitools3.dms.capi.obierrno cimport obi_errno
@ -94,6 +96,7 @@ def run(config):
    cdef   obitype_t   new_type
    cdef   bint        get_quality
    cdef   bint        NUC_SEQS_view
    cdef   bint        silva
    cdef   int         nb_elts
    cdef   object      d
    cdef   View        view
@ -104,6 +107,8 @@ def run(config):
    cdef   Column      seq_col
    cdef   Column      qual_col
    cdef   Column      old_column
    cdef   Column      sci_name_col
    cdef   bytes       sci_name
    cdef   bint        rewrite
    cdef   dict        dcols
    cdef   int         skipping
@ -204,6 +209,13 @@ def run(config):
        def_col = view[DEFINITION_COLUMN]
        seq_col = view[NUC_SEQUENCE_COLUMN]
    # Prepare taxon scientific name if SILVA file
    if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva":
        silva = True
        sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
    else:
        silva = False
    dcols = {}
    # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
@ -294,6 +306,11 @@ def run(config):
                if get_quality:
                    qual_col[i] = entry.quality
                # Parse taxon scientific name if SILVA file
                if silva:
                    sci_name = entry.definition.split(b";")[-1]
                    sci_name_col[i] = sci_name
            for tag in entry :
                if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN :  # TODO dirty 
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -464,7 +464,7 @@ def open_uri(uri,
        if format is not None:
            if seqtype==b"nuc":
                objclass = Nuc_Seq    # Nuc_Seq_Stored? TODO
-                if format==b"fasta":
+                if format==b"fasta" or format==b"silva":
                    if input:
                        iseq = fastaNucIterator(file, 
                                                skip=skip,