Cython API: obi import can now import ngsfilter files and tabular files
This commit is contained in:
@ -31,7 +31,7 @@ default_config = { 'software' : "The OBITools",
|
|||||||
'fileformat' : None,
|
'fileformat' : None,
|
||||||
'skiperror' : True,
|
'skiperror' : True,
|
||||||
'qualityformat' : b'sanger',
|
'qualityformat' : b'sanger',
|
||||||
'qualityoffset' : -1,
|
'offset' : -1,
|
||||||
'noquality' : False,
|
'noquality' : False,
|
||||||
'seqtype' : b'nuc',
|
'seqtype' : b'nuc',
|
||||||
"header" : False,
|
"header" : False,
|
||||||
|
@ -25,8 +25,8 @@ def __addInputOption(optionManager):
|
|||||||
|
|
||||||
group.add_argument('--na-string',
|
group.add_argument('--na-string',
|
||||||
action="store", dest="obi:nastring",
|
action="store", dest="obi:nastring",
|
||||||
default=b"NA",
|
default="NA",
|
||||||
type=bytes,
|
type=str,
|
||||||
help="String associated to Non Available (NA) values")
|
help="String associated to Non Available (NA) values")
|
||||||
|
|
||||||
|
|
||||||
@ -61,7 +61,7 @@ def __addSequenceInputOption(optionManager):
|
|||||||
action="store_const", dest="obi:format",
|
action="store_const", dest="obi:format",
|
||||||
default=None,
|
default=None,
|
||||||
const=b'ngsfilter',
|
const=b'ngsfilter',
|
||||||
help="Input file is a ngsfilter file")
|
help="Input file is an ngsfilter file")
|
||||||
|
|
||||||
group.add_argument('--ecopcr-result',
|
group.add_argument('--ecopcr-result',
|
||||||
action="store_const", dest="obi:format",
|
action="store_const", dest="obi:format",
|
||||||
@ -75,6 +75,12 @@ def __addSequenceInputOption(optionManager):
|
|||||||
const=b'ecoprimers',
|
const=b'ecoprimers',
|
||||||
help="Input file is the result of an ecoprimers")
|
help="Input file is the result of an ecoprimers")
|
||||||
|
|
||||||
|
group.add_argument('--tabular',
|
||||||
|
action="store_const", dest="obi:format",
|
||||||
|
default=None,
|
||||||
|
const=b'tabular',
|
||||||
|
help="Input file is a tabular file")
|
||||||
|
|
||||||
group.add_argument('--skip-on-error',
|
group.add_argument('--skip-on-error',
|
||||||
action="store_true", dest="obi:skiperror",
|
action="store_true", dest="obi:skiperror",
|
||||||
default=False,
|
default=False,
|
||||||
@ -120,13 +126,13 @@ def __addTabularInputOption(optionManager):
|
|||||||
group.add_argument('--sep',
|
group.add_argument('--sep',
|
||||||
action="store", dest="obi:sep",
|
action="store", dest="obi:sep",
|
||||||
default=None,
|
default=None,
|
||||||
type=bytes,
|
type=str,
|
||||||
help="Column separator")
|
help="Column separator")
|
||||||
|
|
||||||
group.add_argument('--dec',
|
group.add_argument('--dec',
|
||||||
action="store", dest="obi:dec",
|
action="store", dest="obi:dec",
|
||||||
default=b".",
|
default=".",
|
||||||
type=bytes,
|
type=str,
|
||||||
help="Decimal separator")
|
help="Decimal separator")
|
||||||
|
|
||||||
group.add_argument('--strip-white',
|
group.add_argument('--strip-white',
|
||||||
@ -141,8 +147,8 @@ def __addTabularInputOption(optionManager):
|
|||||||
|
|
||||||
group.add_argument('--comment-char',
|
group.add_argument('--comment-char',
|
||||||
action="store", dest="obi:commentchar",
|
action="store", dest="obi:commentchar",
|
||||||
default=b"#",
|
default="#",
|
||||||
type=bytes,
|
type=str,
|
||||||
help="Lines starting by this char are considered as comment")
|
help="Lines starting by this char are considered as comment")
|
||||||
|
|
||||||
def __addTaxonomyInputOption(optionManager):
|
def __addTaxonomyInputOption(optionManager):
|
||||||
@ -171,7 +177,7 @@ def addSequenceInputOption(optionManager):
|
|||||||
__addSequenceInputOption(optionManager)
|
__addSequenceInputOption(optionManager)
|
||||||
|
|
||||||
def addTabularInputOption(optionManager):
|
def addTabularInputOption(optionManager):
|
||||||
__addInputOption(optionManager)
|
#__addInputOption(optionManager) # TODO discuss conflict
|
||||||
__addTabularInputOption(optionManager)
|
__addTabularInputOption(optionManager)
|
||||||
|
|
||||||
def addTaxonomyInputOption(optionManager):
|
def addTaxonomyInputOption(optionManager):
|
||||||
|
@ -1,13 +1,8 @@
|
|||||||
#cython: language_level=3
|
#cython: language_level=3
|
||||||
|
|
||||||
# TODO cimport generate errors with argument numbers, but without them some variables can't be declared
|
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||||
from obitools3.files.universalopener cimport uopen
|
|
||||||
from obitools3.parsers.fasta import fastaIterator
|
|
||||||
from obitools3.parsers.fastq import fastqIterator
|
|
||||||
from obitools3.dms.view.view cimport View
|
from obitools3.dms.view.view cimport View
|
||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
from obitools3.dms.column.column cimport Column
|
from obitools3.dms.column.column cimport Column
|
||||||
@ -24,7 +19,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
|
|||||||
|
|
||||||
from obitools3.dms.capi.obierrno cimport obi_errno
|
from obitools3.dms.capi.obierrno cimport obi_errno
|
||||||
|
|
||||||
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
|
from obitools3.apps.optiongroups import addSequenceInputOption, addTabularInputOption, addMinimalOutputOption
|
||||||
|
|
||||||
from obitools3.uri.decode import open_uri
|
from obitools3.uri.decode import open_uri
|
||||||
|
|
||||||
@ -45,6 +40,7 @@ default_config = { 'destview' : None,
|
|||||||
def addOptions(parser):
|
def addOptions(parser):
|
||||||
|
|
||||||
addSequenceInputOption(parser)
|
addSequenceInputOption(parser)
|
||||||
|
addTabularInputOption(parser)
|
||||||
addMinimalOutputOption(parser)
|
addMinimalOutputOption(parser)
|
||||||
# addTaxdumpInputOption(parser)
|
# addTaxdumpInputOption(parser)
|
||||||
|
|
||||||
@ -63,8 +59,8 @@ def run(config):
|
|||||||
cdef int nb_elts
|
cdef int nb_elts
|
||||||
cdef object d
|
cdef object d
|
||||||
cdef View view
|
cdef View view
|
||||||
cdef object iseq
|
cdef object entries
|
||||||
cdef object seq
|
cdef object entry
|
||||||
cdef Column id_col
|
cdef Column id_col
|
||||||
cdef Column def_col
|
cdef Column def_col
|
||||||
cdef Column seq_col
|
cdef Column seq_col
|
||||||
@ -108,9 +104,9 @@ def run(config):
|
|||||||
|
|
||||||
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
|
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
|
||||||
|
|
||||||
iseq = input[1]
|
entries = input[1]
|
||||||
|
|
||||||
NA_value = config['obi']['nastring']
|
NA_value = tobytes(config['obi']['nastring']) # TODO
|
||||||
|
|
||||||
NUC_SEQS_view = False
|
NUC_SEQS_view = False
|
||||||
if isinstance(output[1], View) :
|
if isinstance(output[1], View) :
|
||||||
@ -121,39 +117,39 @@ def run(config):
|
|||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
# Save basic columns in variables for optimization
|
# Save basic columns in variables for optimization
|
||||||
if NUC_SEQS_view :
|
if NUC_SEQS_view :
|
||||||
id_col = view[b"ID"]
|
id_col = view[b"ID"] # TODO use macros or globals for column names
|
||||||
def_col = view[b"DEFINITION"]
|
def_col = view[b"DEFINITION"]
|
||||||
seq_col = view[b"NUC_SEQ"]
|
seq_col = view[b"NUC_SEQ"]
|
||||||
|
|
||||||
dcols = {}
|
dcols = {}
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
for seq in iseq :
|
for entry in entries :
|
||||||
|
|
||||||
pb(i)
|
pb(i)
|
||||||
|
|
||||||
if NUC_SEQS_view :
|
if NUC_SEQS_view :
|
||||||
|
|
||||||
# Check if there is a sequencing quality associated # TODO
|
# Check if there is a sequencing quality associated # TODO
|
||||||
if i == 0:
|
if i == 0:
|
||||||
get_quality = b"QUALITY" in seq
|
get_quality = b"QUALITY" in entry
|
||||||
if get_quality:
|
if get_quality:
|
||||||
Column.new_column(view, b"QUALITY", OBI_QUAL)
|
Column.new_column(view, b"QUALITY", OBI_QUAL)
|
||||||
qual_col = view[b"QUALITY"]
|
qual_col = view[b"QUALITY"]
|
||||||
|
|
||||||
id_col[i] = seq.id
|
id_col[i] = entry.id
|
||||||
def_col[i] = seq.definition
|
def_col[i] = entry.definition
|
||||||
seq_col[i] = seq.seq
|
seq_col[i] = entry.seq
|
||||||
|
|
||||||
if get_quality :
|
if get_quality :
|
||||||
qual_col[i] = seq.quality
|
qual_col[i] = entry.quality
|
||||||
|
|
||||||
for tag in seq :
|
for tag in entry :
|
||||||
|
|
||||||
if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm...
|
if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm...
|
||||||
|
|
||||||
value = seq[tag]
|
value = entry[tag]
|
||||||
|
|
||||||
# Check NA value
|
# Check NA value
|
||||||
if value == NA_value :
|
if value == NA_value :
|
||||||
|
@ -17,7 +17,6 @@ def fastaIterator(lineiterator,
|
|||||||
firstline=None,
|
firstline=None,
|
||||||
int buffersize=100000000
|
int buffersize=100000000
|
||||||
):
|
):
|
||||||
cdef LineBuffer lb
|
|
||||||
cdef str ident
|
cdef str ident
|
||||||
cdef str definition
|
cdef str definition
|
||||||
cdef dict tags
|
cdef dict tags
|
||||||
@ -31,23 +30,26 @@ def fastaIterator(lineiterator,
|
|||||||
else:
|
else:
|
||||||
ionly=int(only)
|
ionly=int(only)
|
||||||
|
|
||||||
if isinstance(lineiterator,(str,bytes)):
|
if isinstance(lineiterator, (str, bytes)):
|
||||||
lineiterator=uopen(lineiterator)
|
lineiterator=uopen(lineiterator)
|
||||||
|
|
||||||
if isinstance(lineiterator, LineBuffer):
|
if isinstance(lineiterator, LineBuffer):
|
||||||
lb=lineiterator
|
iterator = iter(lineiterator)
|
||||||
else:
|
else:
|
||||||
lb=LineBuffer(lineiterator,buffersize)
|
if hasattr(lineiterator, "readlines"):
|
||||||
|
iterator = iter(LineBuffer(lineiterator, buffersize))
|
||||||
|
elif hasattr(lineiterator, '__next__'):
|
||||||
|
iterator = lineiterator
|
||||||
|
else:
|
||||||
|
raise Exception("Invalid line iterator")
|
||||||
|
|
||||||
skipped = 0
|
skipped = 0
|
||||||
i = iter(lb)
|
i = iterator
|
||||||
|
|
||||||
if firstline is None:
|
if firstline is None:
|
||||||
line = next(i)
|
line = next(i)
|
||||||
else:
|
else:
|
||||||
line = firstline
|
line = firstline
|
||||||
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
if ionly >= 0 and read >= ionly:
|
if ionly >= 0 and read >= ionly:
|
||||||
@ -81,7 +83,7 @@ def fastaIterator(lineiterator,
|
|||||||
# definition,
|
# definition,
|
||||||
# tags=tags,
|
# tags=tags,
|
||||||
# )
|
# )
|
||||||
# TODO
|
# TODO Seq object
|
||||||
yield { "id" : ident,
|
yield { "id" : ident,
|
||||||
"definition" : definition,
|
"definition" : definition,
|
||||||
"sequence" : sequence,
|
"sequence" : sequence,
|
||||||
@ -100,7 +102,6 @@ def fastaNucIterator(lineiterator,
|
|||||||
firstline=None,
|
firstline=None,
|
||||||
int buffersize=100000000
|
int buffersize=100000000
|
||||||
):
|
):
|
||||||
cdef LineBuffer lb
|
|
||||||
cdef str ident
|
cdef str ident
|
||||||
cdef str definition
|
cdef str definition
|
||||||
cdef dict tags
|
cdef dict tags
|
||||||
@ -115,14 +116,16 @@ def fastaNucIterator(lineiterator,
|
|||||||
ionly = int(only)
|
ionly = int(only)
|
||||||
|
|
||||||
if isinstance(lineiterator, (str, bytes)):
|
if isinstance(lineiterator, (str, bytes)):
|
||||||
lineiterator=uopen(lineiterator)
|
lineiterator=uopen(lineiterator)
|
||||||
|
|
||||||
if isinstance(lineiterator, types.GeneratorType):
|
|
||||||
iterator = lineiterator
|
|
||||||
if isinstance(lineiterator, LineBuffer):
|
if isinstance(lineiterator, LineBuffer):
|
||||||
iterator = iter(lineiterator)
|
iterator = iter(lineiterator)
|
||||||
else:
|
else:
|
||||||
iterator = iter(LineBuffer(lineiterator, buffersize))
|
if hasattr(lineiterator, "readlines"):
|
||||||
|
iterator = iter(LineBuffer(lineiterator, buffersize))
|
||||||
|
elif hasattr(lineiterator, '__next__'):
|
||||||
|
iterator = lineiterator
|
||||||
|
else:
|
||||||
|
raise Exception("Invalid line iterator")
|
||||||
|
|
||||||
skipped = 0
|
skipped = 0
|
||||||
read = 0
|
read = 0
|
||||||
|
@ -12,7 +12,7 @@ from obitools3.dms.obiseq cimport Nuc_Seq
|
|||||||
def fastqIterator(lineiterator,
|
def fastqIterator(lineiterator,
|
||||||
int skip=0,
|
int skip=0,
|
||||||
only=None,
|
only=None,
|
||||||
int qualityoffset=-1,
|
int offset=-1,
|
||||||
bint noquality=False,
|
bint noquality=False,
|
||||||
firstline=None,
|
firstline=None,
|
||||||
int buffersize=100000000
|
int buffersize=100000000
|
||||||
@ -25,14 +25,14 @@ def fastqIterator(lineiterator,
|
|||||||
else:
|
else:
|
||||||
return fastqWithQualityIterator(lineiterator,
|
return fastqWithQualityIterator(lineiterator,
|
||||||
skip,only,
|
skip,only,
|
||||||
qualityoffset,
|
offset,
|
||||||
firstline,
|
firstline,
|
||||||
buffersize)
|
buffersize)
|
||||||
|
|
||||||
def fastqWithQualityIterator(lineiterator,
|
def fastqWithQualityIterator(lineiterator,
|
||||||
int skip=0,
|
int skip=0,
|
||||||
only=None,
|
only=None,
|
||||||
int qualityoffset=-1,
|
int offset=-1,
|
||||||
firstline=None,
|
firstline=None,
|
||||||
int buffersize=100000000
|
int buffersize=100000000
|
||||||
):
|
):
|
||||||
@ -49,21 +49,25 @@ def fastqWithQualityIterator(lineiterator,
|
|||||||
ionly=-1
|
ionly=-1
|
||||||
else:
|
else:
|
||||||
ionly=int(only)
|
ionly=int(only)
|
||||||
|
|
||||||
if isinstance(lineiterator,(str,bytes)):
|
if isinstance(lineiterator, (str, bytes)):
|
||||||
lineiterator=uopen(lineiterator)
|
lineiterator=uopen(lineiterator)
|
||||||
|
|
||||||
if isinstance(lineiterator, LineBuffer):
|
if isinstance(lineiterator, LineBuffer):
|
||||||
lb=lineiterator
|
iterator = iter(lineiterator)
|
||||||
else:
|
else:
|
||||||
lb=LineBuffer(lineiterator,buffersize)
|
if hasattr(lineiterator, "readlines"):
|
||||||
|
iterator = iter(LineBuffer(lineiterator, buffersize))
|
||||||
i = iter(lb)
|
elif hasattr(lineiterator, '__next__'):
|
||||||
|
iterator = lineiterator
|
||||||
|
else:
|
||||||
|
raise Exception("Invalid line iterator")
|
||||||
|
|
||||||
|
i = iterator
|
||||||
lines_to_skip = skip*4 - (firstline is not None)
|
lines_to_skip = skip*4 - (firstline is not None)
|
||||||
|
|
||||||
for skipped in range(lines_to_skip):
|
for skipped in range(lines_to_skip):
|
||||||
next(i)
|
next(i)
|
||||||
|
|
||||||
if skip > 0:
|
if skip > 0:
|
||||||
firstline=None
|
firstline=None
|
||||||
|
|
||||||
@ -88,7 +92,7 @@ def fastqWithQualityIterator(lineiterator,
|
|||||||
sequence,
|
sequence,
|
||||||
definition=definition,
|
definition=definition,
|
||||||
quality=quality,
|
quality=quality,
|
||||||
offset=qualityoffset,
|
offset=offset,
|
||||||
tags=tags)
|
tags=tags)
|
||||||
|
|
||||||
yield seq
|
yield seq
|
||||||
@ -97,7 +101,7 @@ def fastqWithQualityIterator(lineiterator,
|
|||||||
# "definition" : definition,
|
# "definition" : definition,
|
||||||
# "sequence" : sequence,
|
# "sequence" : sequence,
|
||||||
# "quality" : quality,
|
# "quality" : quality,
|
||||||
# "offset" : qualityoffset,
|
# "offset" : offset,
|
||||||
# "tags" : tags,
|
# "tags" : tags,
|
||||||
# "annotation" : {}
|
# "annotation" : {}
|
||||||
# }
|
# }
|
||||||
@ -112,7 +116,6 @@ def fastqWithoutQualityIterator(lineiterator,
|
|||||||
firstline=None,
|
firstline=None,
|
||||||
int buffersize=100000000
|
int buffersize=100000000
|
||||||
):
|
):
|
||||||
cdef LineBuffer lb
|
|
||||||
cdef str ident
|
cdef str ident
|
||||||
cdef str definition
|
cdef str definition
|
||||||
cdef dict tags
|
cdef dict tags
|
||||||
@ -126,15 +129,19 @@ def fastqWithoutQualityIterator(lineiterator,
|
|||||||
else:
|
else:
|
||||||
ionly=int(only)
|
ionly=int(only)
|
||||||
|
|
||||||
if isinstance(lineiterator,(str,bytes)):
|
if isinstance(lineiterator, (str, bytes)):
|
||||||
lineiterator=uopen(lineiterator)
|
lineiterator=uopen(lineiterator)
|
||||||
|
|
||||||
if isinstance(lineiterator, LineBuffer):
|
if isinstance(lineiterator, LineBuffer):
|
||||||
lb=lineiterator
|
iterator = iter(lineiterator)
|
||||||
else:
|
else:
|
||||||
lb=LineBuffer(lineiterator,buffersize)
|
if hasattr(lineiterator, "readlines"):
|
||||||
|
iterator = iter(LineBuffer(lineiterator, buffersize))
|
||||||
|
elif hasattr(lineiterator, '__next__'):
|
||||||
|
iterator = lineiterator
|
||||||
|
else:
|
||||||
|
raise Exception("Invalid line iterator")
|
||||||
|
|
||||||
i = iter(lb)
|
i = iterator
|
||||||
lines_to_skip = skip*4 - (firstline is not None)
|
lines_to_skip = skip*4 - (firstline is not None)
|
||||||
|
|
||||||
for skipped in range(lines_to_skip):
|
for skipped in range(lines_to_skip):
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
#cython: language_level=3
|
#cython: language_level=3
|
||||||
|
|
||||||
cdef object __etag__(str x)
|
|
||||||
|
|
||||||
cpdef tuple parseHeader(str header)
|
cpdef tuple parseHeader(str header)
|
||||||
|
@ -6,54 +6,12 @@ Created on 25 mars 2016
|
|||||||
@author: coissac
|
@author: coissac
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
from obitools3.utils cimport __etag__
|
||||||
import re
|
import re
|
||||||
|
|
||||||
__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
|
|
||||||
__re_int__ = re.compile("^[+-]?[0-9]+$")
|
|
||||||
__re_float__ = re.compile("^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$")
|
|
||||||
__re_str__ = re.compile("""^"[^"]*"|'[^']*'$""")
|
|
||||||
__re_dict__ = re.compile("""^\{\ *
|
|
||||||
(
|
|
||||||
("[^"]*"|'[^']*')
|
|
||||||
\ *:\ *
|
|
||||||
([^,}]+|
|
|
||||||
"[^"]*"|
|
|
||||||
'[^']*'
|
|
||||||
)
|
|
||||||
)?
|
|
||||||
(\ *,\ *
|
|
||||||
("[^"]*"|'[^']*')
|
|
||||||
\ *:\ *
|
|
||||||
([^,}]+|
|
|
||||||
"[^"]*"|
|
|
||||||
'[^']*'
|
|
||||||
)
|
|
||||||
)*\ *\}$""", re.VERBOSE)
|
|
||||||
|
|
||||||
__re_val__ = re.compile("""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
|
__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
|
||||||
|
|
||||||
cdef object __etag__(str x):
|
|
||||||
cdef list elements
|
|
||||||
cdef tuple i
|
|
||||||
|
|
||||||
if __re_int__.match(x):
|
|
||||||
v=int(x)
|
|
||||||
elif __re_float__.match(x):
|
|
||||||
v=float(x)
|
|
||||||
elif __re_str__.match(x):
|
|
||||||
v=x[1:-1]
|
|
||||||
elif x=='None':
|
|
||||||
v=None
|
|
||||||
elif x=='False':
|
|
||||||
v=False
|
|
||||||
elif x=='True':
|
|
||||||
v=True
|
|
||||||
elif __re_dict__.match(x):
|
|
||||||
elements=__re_val__.findall(x)
|
|
||||||
v=dict([(i[1][1:-1],__etag__(i[2])) for i in elements])
|
|
||||||
else:
|
|
||||||
v=x
|
|
||||||
return v
|
|
||||||
|
|
||||||
cpdef tuple parseHeader(str header):
|
cpdef tuple parseHeader(str header):
|
||||||
cdef list m
|
cdef list m
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
import re
|
import re
|
||||||
from obitools3.parsers.fasta import fastaNucIterator
|
from obitools3.parsers.fasta import fastaNucIterator
|
||||||
from obitools3.parsers.fastq import fastqIterator
|
from obitools3.parsers.fastq import fastqIterator
|
||||||
|
from obitools3.parsers.tab import tabIterator
|
||||||
|
from obitools3.parsers.ngsfilter import ngsfilterIterator
|
||||||
|
|
||||||
|
|
||||||
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)
|
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)
|
||||||
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)
|
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)
|
||||||
|
|
||||||
def is_ngsfilter_line(line):
|
def is_ngsfilter_line(line): # TODO doesn't work?
|
||||||
try:
|
try:
|
||||||
parts = line.split()
|
parts = line.split()
|
||||||
ok = tagre.match(parts[2])
|
ok = tagre.match(parts[2])
|
||||||
@ -23,7 +25,7 @@ def entryIteratorFactory(lineiterator,
|
|||||||
int skip=0,
|
int skip=0,
|
||||||
only=None,
|
only=None,
|
||||||
bytes seqtype=b'nuc',
|
bytes seqtype=b'nuc',
|
||||||
int qualityoffset=-1,
|
int offset=-1,
|
||||||
bint noquality=False,
|
bint noquality=False,
|
||||||
bint skiperror=True,
|
bint skiperror=True,
|
||||||
bint header=False,
|
bint header=False,
|
||||||
@ -35,15 +37,19 @@ def entryIteratorFactory(lineiterator,
|
|||||||
bytes commentchar=b"#",
|
bytes commentchar=b"#",
|
||||||
int buffersize=100000000):
|
int buffersize=100000000):
|
||||||
|
|
||||||
if isinstance(lineiterator,(str,bytes)):
|
if isinstance(lineiterator, (str, bytes)):
|
||||||
lineiterator=uopen(lineiterator)
|
lineiterator=uopen(lineiterator)
|
||||||
|
|
||||||
if isinstance(lineiterator, LineBuffer):
|
if isinstance(lineiterator, LineBuffer):
|
||||||
lb=lineiterator
|
iterator = iter(lineiterator)
|
||||||
else:
|
else:
|
||||||
lb=LineBuffer(lineiterator, buffersize)
|
if hasattr(lineiterator, "readlines"):
|
||||||
|
iterator = iter(LineBuffer(lineiterator, buffersize))
|
||||||
|
elif hasattr(lineiterator, '__next__'):
|
||||||
|
iterator = lineiterator
|
||||||
|
else:
|
||||||
|
raise Exception("Invalid line iterator")
|
||||||
|
|
||||||
i = iter(lb)
|
i = iterator
|
||||||
|
|
||||||
first=next(i)
|
first=next(i)
|
||||||
|
|
||||||
@ -57,11 +63,11 @@ def entryIteratorFactory(lineiterator,
|
|||||||
format=b"embl"
|
format=b"embl"
|
||||||
elif first[0:6]=='LOCUS ':
|
elif first[0:6]=='LOCUS ':
|
||||||
format=b"genbank"
|
format=b"genbank"
|
||||||
elif first[0:11]=='#@ecopcr-v2':
|
elif first[0:11]=='#@ecopcr-v2': # TODO v2????
|
||||||
format=b"ecopcrfile"
|
format=b"ecopcrfile"
|
||||||
elif is_ngsfilter_line(first):
|
elif is_ngsfilter_line(first):
|
||||||
format=b"ngsfilter"
|
format=b"ngsfilter"
|
||||||
|
|
||||||
# TODO Temporary fix
|
# TODO Temporary fix
|
||||||
first=None
|
first=None
|
||||||
lineiterator.seek(0)
|
lineiterator.seek(0)
|
||||||
@ -78,12 +84,36 @@ def entryIteratorFactory(lineiterator,
|
|||||||
elif format==b'fastq':
|
elif format==b'fastq':
|
||||||
return (fastqIterator(lineiterator,
|
return (fastqIterator(lineiterator,
|
||||||
skip=skip,only=only,
|
skip=skip,only=only,
|
||||||
qualityoffset=qualityoffset,
|
offset=offset,
|
||||||
noquality=noquality,
|
noquality=noquality,
|
||||||
firstline=first,
|
firstline=first,
|
||||||
buffersize=buffersize),
|
buffersize=buffersize),
|
||||||
Nuc_Seq)
|
Nuc_Seq)
|
||||||
|
elif format==b'tabular':
|
||||||
|
return (tabIterator(lineiterator,
|
||||||
|
header = header,
|
||||||
|
sep = sep,
|
||||||
|
dec = dec,
|
||||||
|
stripwhite = stripwhite,
|
||||||
|
blanklineskip = blanklineskip,
|
||||||
|
commentchar = commentchar,
|
||||||
|
skip = skip,
|
||||||
|
only = only,
|
||||||
|
firstline=first,
|
||||||
|
buffersize=buffersize),
|
||||||
|
dict)
|
||||||
|
elif format==b'ngsfilter':
|
||||||
|
return (ngsfilterIterator(lineiterator,
|
||||||
|
sep = sep,
|
||||||
|
dec = dec,
|
||||||
|
stripwhite = stripwhite,
|
||||||
|
blanklineskip = blanklineskip,
|
||||||
|
commentchar = commentchar,
|
||||||
|
skip = skip,
|
||||||
|
only = only,
|
||||||
|
firstline=first,
|
||||||
|
buffersize=buffersize),
|
||||||
|
dict)
|
||||||
|
|
||||||
raise NotImplementedError('File format not yet implemented')
|
raise NotImplementedError('File format not yet implemented')
|
||||||
|
|
||||||
|
@ -7,11 +7,15 @@ from obitools3.dms.dms import DMS
|
|||||||
|
|
||||||
from obitools3.parsers.fasta import fastaNucIterator
|
from obitools3.parsers.fasta import fastaNucIterator
|
||||||
from obitools3.parsers.fastq import fastqIterator
|
from obitools3.parsers.fastq import fastqIterator
|
||||||
|
from obitools3.parsers.tab import tabIterator
|
||||||
|
from obitools3.parsers.ngsfilter import ngsfilterIterator
|
||||||
from obitools3.parsers.universal import entryIteratorFactory
|
from obitools3.parsers.universal import entryIteratorFactory
|
||||||
|
|
||||||
from obitools3.dms.obiseq import Nuc_Seq
|
from obitools3.dms.obiseq import Nuc_Seq
|
||||||
from obitools3.apps.config import getConfiguration,logger
|
from obitools3.apps.config import getConfiguration,logger
|
||||||
from obitools3.apps.temp import get_temp_dms
|
from obitools3.apps.temp import get_temp_dms
|
||||||
|
from obitools3.utils cimport tobytes # TODO because can't read options as bytes
|
||||||
|
|
||||||
|
|
||||||
class MalformedURIException(RuntimeError):
|
class MalformedURIException(RuntimeError):
|
||||||
pass
|
pass
|
||||||
@ -210,22 +214,24 @@ def open_uri(uri,
|
|||||||
if file is not None:
|
if file is not None:
|
||||||
qualifiers=parse_qs(urip.query)
|
qualifiers=parse_qs(urip.query)
|
||||||
|
|
||||||
|
|
||||||
if b'format' in qualifiers:
|
if b'format' in qualifiers:
|
||||||
format = qualifiers[b'format'][0]
|
format = qualifiers[b'format'][0]
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
format=config["obi"]["fileformat"]
|
format=config["obi"]["format"]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
format=None
|
format=None
|
||||||
|
|
||||||
if b'seqtype' in qualifiers:
|
if b'seqtype' in qualifiers:
|
||||||
seqtype=qualifiers[b'seqtype'][0]
|
seqtype=qualifiers[b'seqtype'][0]
|
||||||
else:
|
else:
|
||||||
try:
|
if format == b"ngsfilter": # TODO discuss
|
||||||
seqtype=config["obi"]["seqtype"]
|
seqtype=None
|
||||||
except KeyError:
|
else:
|
||||||
seqtype=b'nuc'
|
try:
|
||||||
|
seqtype=config["obi"]["seqtype"]
|
||||||
|
except KeyError:
|
||||||
|
seqtype=b"nuc"
|
||||||
|
|
||||||
if b'skip' in qualifiers:
|
if b'skip' in qualifiers:
|
||||||
skip=int(qualifiers[b"skip"][0])
|
skip=int(qualifiers[b"skip"][0])
|
||||||
@ -286,7 +292,7 @@ def open_uri(uri,
|
|||||||
offset=33
|
offset=33
|
||||||
elif config["obi"]["qualityformat"][0]=="solexa":
|
elif config["obi"]["qualityformat"][0]=="solexa":
|
||||||
offset=64
|
offset=64
|
||||||
#offset=config["obi"]["qualityoffset"] # TODO discuss
|
#offset=config["obi"]["offset"] # TODO discuss
|
||||||
except KeyError:
|
except KeyError:
|
||||||
offset=33
|
offset=33
|
||||||
|
|
||||||
@ -304,10 +310,10 @@ def open_uri(uri,
|
|||||||
raise MalformedURIException('Malformed header argument in URI')
|
raise MalformedURIException('Malformed header argument in URI')
|
||||||
|
|
||||||
if b"sep" in qualifiers:
|
if b"sep" in qualifiers:
|
||||||
sep=qualifiers[b"sep"][0][0]
|
sep=tobytes(qualifiers[b"sep"][0][0])
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
sep=config["obi"]["sep"]
|
sep=tobytes(config["obi"]["sep"])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
sep=None
|
sep=None
|
||||||
|
|
||||||
@ -315,18 +321,18 @@ def open_uri(uri,
|
|||||||
# pass
|
# pass
|
||||||
|
|
||||||
if b"dec" in qualifiers:
|
if b"dec" in qualifiers:
|
||||||
dec=qualifiers[b"dec"][0][0]
|
dec=tobytes(qualifiers[b"dec"][0][0])
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
dec=config["obi"]["dec"]
|
dec=tobytes(config["obi"]["dec"])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
dec=b"."
|
dec=b"."
|
||||||
|
|
||||||
if b"nastring" in qualifiers:
|
if b"nastring" in qualifiers:
|
||||||
nastring=qualifiers[b"nastring"][0]
|
nastring=tobytes(qualifiers[b"nastring"][0])
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
nastring=config["obi"]["nastring"]
|
nastring=tobytes(config["obi"]["nastring"])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
nastring=b'NA'
|
nastring=b'NA'
|
||||||
|
|
||||||
@ -357,15 +363,15 @@ def open_uri(uri,
|
|||||||
raise MalformedURIException('Malformed blanklineskip argument in URI')
|
raise MalformedURIException('Malformed blanklineskip argument in URI')
|
||||||
|
|
||||||
if b"commentchar" in qualifiers:
|
if b"commentchar" in qualifiers:
|
||||||
commentchar=qualifiers[b"commentchar"][0][0]
|
commentchar=tobytes(qualifiers[b"commentchar"][0][0])
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
commentchar=config["obi"]["commentchar"]
|
commentchar=tobytes(config["obi"]["commentchar"])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
commentchar=b'#'
|
commentchar=b'#'
|
||||||
|
|
||||||
if format is not None:
|
if format is not None:
|
||||||
if qualifiers[b"seqtype"]==b"nuc":
|
if seqtype==b"nuc":
|
||||||
objclass = Nuc_Seq
|
objclass = Nuc_Seq
|
||||||
if format==b"fasta":
|
if format==b"fasta":
|
||||||
iseq = fastaNucIterator(file,
|
iseq = fastaNucIterator(file,
|
||||||
@ -379,8 +385,29 @@ def open_uri(uri,
|
|||||||
noquality=noquality)
|
noquality=noquality)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError('Sequence file format not implemented')
|
raise NotImplementedError('Sequence file format not implemented')
|
||||||
elif qualifiers[b"seqtype"]==b"prot":
|
elif seqtype==b"prot":
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
elif format==b"tabular":
|
||||||
|
objclass = dict
|
||||||
|
iseq = tabIterator(file,
|
||||||
|
header = header,
|
||||||
|
sep = sep,
|
||||||
|
dec = dec,
|
||||||
|
stripwhite = stripwhite,
|
||||||
|
blanklineskip = blanklineskip,
|
||||||
|
commentchar = commentchar,
|
||||||
|
skip = skip,
|
||||||
|
only = only)
|
||||||
|
elif format==b"ngsfilter":
|
||||||
|
objclass = dict
|
||||||
|
iseq = ngsfilterIterator(file,
|
||||||
|
sep = sep,
|
||||||
|
dec = dec,
|
||||||
|
stripwhite = stripwhite,
|
||||||
|
blanklineskip = blanklineskip,
|
||||||
|
commentchar = commentchar,
|
||||||
|
skip = skip,
|
||||||
|
only = only)
|
||||||
else:
|
else:
|
||||||
iseq,objclass = entryIteratorFactory(file,
|
iseq,objclass = entryIteratorFactory(file,
|
||||||
skip, only,
|
skip, only,
|
||||||
|
Reference in New Issue
Block a user