Cython API: obi import can now import ngsfilter files and tabular files

This commit is contained in:
Celine Mercier
2018-03-12 18:10:43 +01:00
parent 8a0b95c1d6
commit 15e43bb9a1
9 changed files with 168 additions and 142 deletions

View File

@ -1,13 +1,8 @@
#cython: language_level=3
# TODO cimport generates errors with argument numbers, but without them some variables can't be declared
import sys
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column
@ -24,7 +19,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.apps.optiongroups import addSequenceInputOption, addTabularInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
@ -45,6 +40,7 @@ default_config = { 'destview' : None,
def addOptions(parser):
addSequenceInputOption(parser)
addTabularInputOption(parser)
addMinimalOutputOption(parser)
# addTaxdumpInputOption(parser)
@ -63,8 +59,8 @@ def run(config):
cdef int nb_elts
cdef object d
cdef View view
cdef object iseq
cdef object seq
cdef object entries
cdef object entry
cdef Column id_col
cdef Column def_col
cdef Column seq_col
@ -108,9 +104,9 @@ def run(config):
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
iseq = input[1]
entries = input[1]
NA_value = config['obi']['nastring']
NA_value = tobytes(config['obi']['nastring']) # TODO
NUC_SEQS_view = False
if isinstance(output[1], View) :
@ -121,39 +117,39 @@ def run(config):
raise NotImplementedError()
# Save basic columns in variables for optimization
if NUC_SEQS_view :
id_col = view[b"ID"]
if NUC_SEQS_view :
id_col = view[b"ID"] # TODO use macros or globals for column names
def_col = view[b"DEFINITION"]
seq_col = view[b"NUC_SEQ"]
dcols = {}
i = 0
for seq in iseq :
for entry in entries :
pb(i)
if NUC_SEQS_view :
# Check whether a sequencing quality is associated with the entries # TODO
if i == 0:
get_quality = b"QUALITY" in seq
get_quality = b"QUALITY" in entry
if get_quality:
Column.new_column(view, b"QUALITY", OBI_QUAL)
qual_col = view[b"QUALITY"]
id_col[i] = seq.id
def_col[i] = seq.definition
seq_col[i] = seq.seq
id_col[i] = entry.id
def_col[i] = entry.definition
seq_col[i] = entry.seq
if get_quality :
qual_col[i] = seq.quality
qual_col[i] = entry.quality
for tag in seq :
for tag in entry :
if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm...
value = seq[tag]
value = entry[tag]
# Check NA value
if value == NA_value :