import: can now import SILVA FASTA files
This commit is contained in:
@ -39,6 +39,12 @@ def __addImportInputOption(optionManager):
|
|||||||
const=b'fastq',
|
const=b'fastq',
|
||||||
help="Input file is in fastq format")
|
help="Input file is in fastq format")
|
||||||
|
|
||||||
|
group.add_argument('--silva-input',
|
||||||
|
action="store_const", dest="obi:inputformat",
|
||||||
|
default=None,
|
||||||
|
const=b'silva',
|
||||||
|
help="Input file is in SILVA fasta format")
|
||||||
|
|
||||||
group.add_argument('--embl-input',
|
group.add_argument('--embl-input',
|
||||||
action="store_const", dest="obi:inputformat",
|
action="store_const", dest="obi:inputformat",
|
||||||
default=None,
|
default=None,
|
||||||
|
@ -26,13 +26,15 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \
|
|||||||
QUALITY_COLUMN, \
|
QUALITY_COLUMN, \
|
||||||
COUNT_COLUMN, \
|
COUNT_COLUMN, \
|
||||||
TAXID_COLUMN, \
|
TAXID_COLUMN, \
|
||||||
MERGED_PREFIX
|
MERGED_PREFIX, \
|
||||||
|
SCIENTIFIC_NAME_COLUMN
|
||||||
|
|
||||||
from obitools3.dms.capi.obidms cimport obi_import_view
|
from obitools3.dms.capi.obidms cimport obi_import_view
|
||||||
|
|
||||||
from obitools3.dms.capi.obitypes cimport obitype_t, \
|
from obitools3.dms.capi.obitypes cimport obitype_t, \
|
||||||
OBI_VOID, \
|
OBI_VOID, \
|
||||||
OBI_QUAL
|
OBI_QUAL, \
|
||||||
|
OBI_STR
|
||||||
|
|
||||||
from obitools3.dms.capi.obierrno cimport obi_errno
|
from obitools3.dms.capi.obierrno cimport obi_errno
|
||||||
|
|
||||||
@ -94,6 +96,7 @@ def run(config):
|
|||||||
cdef obitype_t new_type
|
cdef obitype_t new_type
|
||||||
cdef bint get_quality
|
cdef bint get_quality
|
||||||
cdef bint NUC_SEQS_view
|
cdef bint NUC_SEQS_view
|
||||||
|
cdef bint silva
|
||||||
cdef int nb_elts
|
cdef int nb_elts
|
||||||
cdef object d
|
cdef object d
|
||||||
cdef View view
|
cdef View view
|
||||||
@ -104,6 +107,8 @@ def run(config):
|
|||||||
cdef Column seq_col
|
cdef Column seq_col
|
||||||
cdef Column qual_col
|
cdef Column qual_col
|
||||||
cdef Column old_column
|
cdef Column old_column
|
||||||
|
cdef Column sci_name_col
|
||||||
|
cdef bytes sci_name
|
||||||
cdef bint rewrite
|
cdef bint rewrite
|
||||||
cdef dict dcols
|
cdef dict dcols
|
||||||
cdef int skipping
|
cdef int skipping
|
||||||
@ -203,9 +208,16 @@ def run(config):
|
|||||||
id_col = view[ID_COLUMN]
|
id_col = view[ID_COLUMN]
|
||||||
def_col = view[DEFINITION_COLUMN]
|
def_col = view[DEFINITION_COLUMN]
|
||||||
seq_col = view[NUC_SEQUENCE_COLUMN]
|
seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||||
|
|
||||||
|
# Prepare taxon scientific name if SILVA file
|
||||||
|
if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva":
|
||||||
|
silva = True
|
||||||
|
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
|
||||||
|
else:
|
||||||
|
silva = False
|
||||||
|
|
||||||
dcols = {}
|
dcols = {}
|
||||||
|
|
||||||
# First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
|
# First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
|
||||||
if config['import']['preread']:
|
if config['import']['preread']:
|
||||||
logger("info", "First readthrough...")
|
logger("info", "First readthrough...")
|
||||||
@ -282,7 +294,7 @@ def run(config):
|
|||||||
try:
|
try:
|
||||||
|
|
||||||
if NUC_SEQS_view:
|
if NUC_SEQS_view:
|
||||||
id_col[i] = entry.id
|
id_col[i] = entry.id
|
||||||
def_col[i] = entry.definition
|
def_col[i] = entry.definition
|
||||||
seq_col[i] = entry.seq
|
seq_col[i] = entry.seq
|
||||||
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
|
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
|
||||||
@ -293,6 +305,11 @@ def run(config):
|
|||||||
qual_col = view[QUALITY_COLUMN]
|
qual_col = view[QUALITY_COLUMN]
|
||||||
if get_quality:
|
if get_quality:
|
||||||
qual_col[i] = entry.quality
|
qual_col[i] = entry.quality
|
||||||
|
|
||||||
|
# Parse taxon scientific name if SILVA file
|
||||||
|
if silva:
|
||||||
|
sci_name = entry.definition.split(b";")[-1]
|
||||||
|
sci_name_col[i] = sci_name
|
||||||
|
|
||||||
for tag in entry :
|
for tag in entry :
|
||||||
|
|
||||||
|
@ -464,7 +464,7 @@ def open_uri(uri,
|
|||||||
if format is not None:
|
if format is not None:
|
||||||
if seqtype==b"nuc":
|
if seqtype==b"nuc":
|
||||||
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
|
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
|
||||||
if format==b"fasta":
|
if format==b"fasta" or format==b"silva":
|
||||||
if input:
|
if input:
|
||||||
iseq = fastaNucIterator(file,
|
iseq = fastaNucIterator(file,
|
||||||
skip=skip,
|
skip=skip,
|
||||||
|
Reference in New Issue
Block a user