Cython: added full handling of NA strings when importing files
This commit is contained in:
@ -83,7 +83,6 @@ def run(config):
|
||||
cdef list old_elements_names
|
||||
cdef list new_elements_names
|
||||
cdef ProgressBar pb
|
||||
cdef bytes NA_value
|
||||
global obi_errno
|
||||
|
||||
DMS.obi_atexit()
|
||||
@ -122,9 +121,7 @@ def run(config):
|
||||
pb = ProgressBar(10000000, config, seconde=5) # TODO should be number of records in file
|
||||
|
||||
entries = input[1]
|
||||
|
||||
NA_value = tobytes(config['obi']['inputnastring'])
|
||||
|
||||
|
||||
NUC_SEQS_view = False
|
||||
if isinstance(output[1], View) :
|
||||
view = output[1]
|
||||
@ -169,11 +166,7 @@ def run(config):
|
||||
value = entry[tag]
|
||||
if tag == b"taxid":
|
||||
tag = b"TAXID"
|
||||
|
||||
# Check NA value
|
||||
if value == NA_value :
|
||||
value = None
|
||||
|
||||
|
||||
if tag not in dcols :
|
||||
|
||||
value_type = type(value)
|
||||
|
@ -93,7 +93,8 @@ def fastaNucIterator(lineiterator,
|
||||
int skip=0,
|
||||
only=None,
|
||||
firstline=None,
|
||||
int buffersize=100000000
|
||||
int buffersize=100000000,
|
||||
bytes nastring=b"NA"
|
||||
):
|
||||
|
||||
cdef bytes ident
|
||||
@ -143,7 +144,7 @@ def fastaNucIterator(lineiterator,
|
||||
pass
|
||||
skipped += 1
|
||||
|
||||
ident,tags,definition = parseHeader(line)
|
||||
ident,tags,definition = parseHeader(line, nastring=nastring)
|
||||
s = []
|
||||
line = next(iterator)
|
||||
|
||||
|
@ -15,19 +15,22 @@ def fastqIterator(lineiterator,
|
||||
int offset=-1,
|
||||
bint noquality=False,
|
||||
firstline=None,
|
||||
int buffersize=100000000
|
||||
int buffersize=100000000,
|
||||
bytes nastring=b"NA"
|
||||
):
|
||||
if noquality:
|
||||
return fastqWithoutQualityIterator(lineiterator,
|
||||
skip,only,
|
||||
firstline,
|
||||
buffersize)
|
||||
buffersize,
|
||||
nastring)
|
||||
else:
|
||||
return fastqWithQualityIterator(lineiterator,
|
||||
skip,only,
|
||||
offset,
|
||||
firstline,
|
||||
buffersize)
|
||||
buffersize,
|
||||
nastring)
|
||||
|
||||
|
||||
def fastqWithQualityIterator(lineiterator,
|
||||
@ -35,7 +38,8 @@ def fastqWithQualityIterator(lineiterator,
|
||||
only=None,
|
||||
int offset=-1,
|
||||
firstline=None,
|
||||
int buffersize=100000000
|
||||
int buffersize=100000000,
|
||||
bytes nastring=b"NA"
|
||||
):
|
||||
|
||||
cdef LineBuffer lb
|
||||
@ -84,7 +88,7 @@ def fastqWithQualityIterator(lineiterator,
|
||||
if ionly >= 0 and read >= ionly:
|
||||
break
|
||||
|
||||
ident,tags,definition = parseHeader(hline)
|
||||
ident,tags,definition = parseHeader(hline, nastring=nastring)
|
||||
sequence = line[0:-1]
|
||||
next(i)
|
||||
quality = next(i)[0:-1]
|
||||
@ -106,7 +110,8 @@ def fastqWithoutQualityIterator(lineiterator,
|
||||
int skip=0,
|
||||
only=None,
|
||||
firstline=None,
|
||||
int buffersize=100000000
|
||||
int buffersize=100000000,
|
||||
bytes nastring=b"NA"
|
||||
):
|
||||
cdef bytes ident
|
||||
cdef bytes definition
|
||||
@ -154,7 +159,7 @@ def fastqWithoutQualityIterator(lineiterator,
|
||||
if ionly >= 0 and read >= ionly:
|
||||
break
|
||||
|
||||
ident,tags,definition = parseHeader(hline)
|
||||
ident,tags,definition = parseHeader(hline, nastring=nastring)
|
||||
sequence = line[0:-1]
|
||||
next(i)
|
||||
next(i)
|
||||
|
@ -1,4 +1,4 @@
|
||||
#cython: language_level=3
|
||||
|
||||
|
||||
cpdef tuple parseHeader(bytes header)
|
||||
cpdef tuple parseHeader(bytes header, bytes nastring=*)
|
||||
|
@ -13,7 +13,7 @@ import re
|
||||
__ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
|
||||
|
||||
|
||||
cpdef tuple parseHeader(bytes header):
|
||||
cpdef tuple parseHeader(bytes header, bytes nastring=b"NA"):
|
||||
cdef list m
|
||||
cdef dict tags
|
||||
cdef bytes definition
|
||||
@ -34,7 +34,7 @@ cpdef tuple parseHeader(bytes header):
|
||||
m = __ret__.findall(second)
|
||||
|
||||
if m:
|
||||
tags = dict([(a[1],__etag__(a[2])) for a in m])
|
||||
tags = dict([(a[1],__etag__(a[2], nastring=nastring)) for a in m])
|
||||
definition = second.split(m[-1][0],1)[1].strip()
|
||||
else:
|
||||
tags = {}
|
||||
|
@ -16,6 +16,7 @@ def ngsfilterIterator(lineiterator,
|
||||
bint stripwhite=True,
|
||||
bint blanklineskip=True,
|
||||
bytes commentchar=b"#",
|
||||
bytes nastring=b"NA",
|
||||
int skip=0,
|
||||
only=None,
|
||||
firstline=None,
|
||||
@ -69,6 +70,7 @@ def ngsfilterIterator(lineiterator,
|
||||
stripwhite = stripwhite,
|
||||
blanklineskip = blanklineskip,
|
||||
commentchar = commentchar,
|
||||
nastring = nastring,
|
||||
skip = skip,
|
||||
only = only,
|
||||
firstline = None)
|
||||
|
@ -17,6 +17,7 @@ def tabIterator(lineiterator,
|
||||
bint stripwhite=True,
|
||||
bint blanklineskip=True,
|
||||
bytes commentchar=b"#",
|
||||
bytes nastring=b"NA",
|
||||
int skip=0,
|
||||
only=None,
|
||||
firstline=None,
|
||||
@ -89,11 +90,10 @@ def tabIterator(lineiterator,
|
||||
data = [x.strip() for x in data]
|
||||
|
||||
for i in range(len(data)):
|
||||
if key_types:
|
||||
type_func = key_types[i]
|
||||
if key_types: # TODO handle None when key types are actually read
|
||||
view_line[keys[i]] = key_types[i](data[i])
|
||||
else:
|
||||
type_func = __etag__
|
||||
view_line[keys[i]] = type_func(data[i])
|
||||
view_line[keys[i]] = __etag__(data[i], nastring=nastring)
|
||||
|
||||
yield view_line
|
||||
|
||||
|
@ -81,7 +81,8 @@ def entryIteratorFactory(lineiterator,
|
||||
return (fastaNucIterator(lineiterator,
|
||||
skip=skip,only=only,
|
||||
firstline=first,
|
||||
buffersize=buffersize),
|
||||
buffersize=buffersize,
|
||||
nastring=nastring),
|
||||
Nuc_Seq)
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
@ -91,7 +92,8 @@ def entryIteratorFactory(lineiterator,
|
||||
offset=offset,
|
||||
noquality=noquality,
|
||||
firstline=first,
|
||||
buffersize=buffersize),
|
||||
buffersize=buffersize,
|
||||
nastring=nastring),
|
||||
Nuc_Seq)
|
||||
elif format==b'tabular':
|
||||
return (tabIterator(lineiterator,
|
||||
@ -101,6 +103,7 @@ def entryIteratorFactory(lineiterator,
|
||||
stripwhite = stripwhite,
|
||||
blanklineskip = blanklineskip,
|
||||
commentchar = commentchar,
|
||||
nastring=nastring,
|
||||
skip = skip,
|
||||
only = only,
|
||||
firstline=first,
|
||||
@ -113,6 +116,7 @@ def entryIteratorFactory(lineiterator,
|
||||
stripwhite = stripwhite,
|
||||
blanklineskip = blanklineskip,
|
||||
commentchar = commentchar,
|
||||
nastring=nastring,
|
||||
skip = skip,
|
||||
only = only,
|
||||
firstline=first,
|
||||
|
@ -425,7 +425,8 @@ def open_uri(uri,
|
||||
if input:
|
||||
iseq = fastaNucIterator(file,
|
||||
skip=skip,
|
||||
only=only)
|
||||
only=only,
|
||||
nastring=nastring)
|
||||
else:
|
||||
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
|
||||
file,
|
||||
@ -437,7 +438,8 @@ def open_uri(uri,
|
||||
skip=skip,
|
||||
only=only,
|
||||
offset=offset,
|
||||
noquality=noquality)
|
||||
noquality=noquality,
|
||||
nastring=nastring)
|
||||
else:
|
||||
iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring),
|
||||
file,
|
||||
@ -464,6 +466,7 @@ def open_uri(uri,
|
||||
stripwhite = stripwhite,
|
||||
blanklineskip = blanklineskip,
|
||||
commentchar = commentchar,
|
||||
nastring=nastring,
|
||||
skip = skip,
|
||||
only = only)
|
||||
else:
|
||||
@ -477,6 +480,7 @@ def open_uri(uri,
|
||||
stripwhite = stripwhite,
|
||||
blanklineskip = blanklineskip,
|
||||
commentchar = commentchar,
|
||||
nastring=nastring,
|
||||
skip = skip,
|
||||
only = only)
|
||||
else:
|
||||
|
@ -18,4 +18,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value)
|
||||
cdef obitype_t get_obitype_iterable_value(object value)
|
||||
cdef obitype_t get_obitype(object value)
|
||||
|
||||
cdef object __etag__(bytes x)
|
||||
cdef object __etag__(bytes x, bytes nastring=*)
|
||||
|
@ -247,11 +247,13 @@ __re_dict__ = re.compile(b"""^\{\ *
|
||||
|
||||
__re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
|
||||
|
||||
cdef object __etag__(bytes x):
|
||||
cdef object __etag__(bytes x, bytes nastring=b"NA"):
|
||||
cdef list elements
|
||||
cdef tuple i
|
||||
|
||||
if __re_int__.match(x):
|
||||
if x == nastring:
|
||||
v = None
|
||||
elif __re_int__.match(x):
|
||||
v=int(x)
|
||||
elif __re_float__.match(x):
|
||||
v=float(x)
|
||||
|
Reference in New Issue
Block a user