import: can now import SILVA fasta files

This commit is contained in:
Celine Mercier
2020-10-30 10:43:04 +01:00
parent 199f3772e8
commit b9b4cec5b5
3 changed files with 29 additions and 6 deletions

View File

@ -39,6 +39,12 @@ def __addImportInputOption(optionManager):
const=b'fastq', const=b'fastq',
help="Input file is in fastq format") help="Input file is in fastq format")
group.add_argument('--silva-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'silva',
help="Input file is in SILVA fasta format")
group.add_argument('--embl-input', group.add_argument('--embl-input',
action="store_const", dest="obi:inputformat", action="store_const", dest="obi:inputformat",
default=None, default=None,

View File

@ -26,13 +26,15 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \
QUALITY_COLUMN, \ QUALITY_COLUMN, \
COUNT_COLUMN, \ COUNT_COLUMN, \
TAXID_COLUMN, \ TAXID_COLUMN, \
MERGED_PREFIX MERGED_PREFIX, \
SCIENTIFIC_NAME_COLUMN
from obitools3.dms.capi.obidms cimport obi_import_view from obitools3.dms.capi.obidms cimport obi_import_view
from obitools3.dms.capi.obitypes cimport obitype_t, \ from obitools3.dms.capi.obitypes cimport obitype_t, \
OBI_VOID, \ OBI_VOID, \
OBI_QUAL OBI_QUAL, \
OBI_STR
from obitools3.dms.capi.obierrno cimport obi_errno from obitools3.dms.capi.obierrno cimport obi_errno
@ -94,6 +96,7 @@ def run(config):
cdef obitype_t new_type cdef obitype_t new_type
cdef bint get_quality cdef bint get_quality
cdef bint NUC_SEQS_view cdef bint NUC_SEQS_view
cdef bint silva
cdef int nb_elts cdef int nb_elts
cdef object d cdef object d
cdef View view cdef View view
@ -104,6 +107,8 @@ def run(config):
cdef Column seq_col cdef Column seq_col
cdef Column qual_col cdef Column qual_col
cdef Column old_column cdef Column old_column
cdef Column sci_name_col
cdef bytes sci_name
cdef bint rewrite cdef bint rewrite
cdef dict dcols cdef dict dcols
cdef int skipping cdef int skipping
@ -204,6 +209,13 @@ def run(config):
def_col = view[DEFINITION_COLUMN] def_col = view[DEFINITION_COLUMN]
seq_col = view[NUC_SEQUENCE_COLUMN] seq_col = view[NUC_SEQUENCE_COLUMN]
# Prepare taxon scientific name if SILVA file
if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva":
silva = True
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
else:
silva = False
dcols = {} dcols = {}
# First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
@ -294,6 +306,11 @@ def run(config):
if get_quality: if get_quality:
qual_col[i] = entry.quality qual_col[i] = entry.quality
# Parse taxon scientific name if SILVA file
if silva:
sci_name = entry.definition.split(b";")[-1]
sci_name_col[i] = sci_name
for tag in entry : for tag in entry :
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty

View File

@ -464,7 +464,7 @@ def open_uri(uri,
if format is not None: if format is not None:
if seqtype==b"nuc": if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
if format==b"fasta": if format==b"fasta" or format==b"silva":
if input: if input:
iseq = fastaNucIterator(file, iseq = fastaNucIterator(file,
skip=skip, skip=skip,