Cython: added full handling of NA strings when importing files

This commit is contained in:
Celine Mercier
2018-10-17 16:41:15 +02:00
parent da76f911db
commit da0e3d4043
11 changed files with 43 additions and 32 deletions

View File

@ -93,7 +93,8 @@ def fastaNucIterator(lineiterator,
int skip=0,
only=None,
firstline=None,
int buffersize=100000000
int buffersize=100000000,
bytes nastring=b"NA"
):
cdef bytes ident
@ -143,7 +144,7 @@ def fastaNucIterator(lineiterator,
pass
skipped += 1
ident,tags,definition = parseHeader(line)
ident,tags,definition = parseHeader(line, nastring=nastring)
s = []
line = next(iterator)

View File

@ -15,19 +15,22 @@ def fastqIterator(lineiterator,
int offset=-1,
bint noquality=False,
firstline=None,
int buffersize=100000000
int buffersize=100000000,
bytes nastring=b"NA"
):
if noquality:
return fastqWithoutQualityIterator(lineiterator,
skip,only,
firstline,
buffersize)
buffersize,
nastring)
else:
return fastqWithQualityIterator(lineiterator,
skip,only,
offset,
firstline,
buffersize)
buffersize,
nastring)
def fastqWithQualityIterator(lineiterator,
@ -35,7 +38,8 @@ def fastqWithQualityIterator(lineiterator,
only=None,
int offset=-1,
firstline=None,
int buffersize=100000000
int buffersize=100000000,
bytes nastring=b"NA"
):
cdef LineBuffer lb
@ -84,7 +88,7 @@ def fastqWithQualityIterator(lineiterator,
if ionly >= 0 and read >= ionly:
break
ident,tags,definition = parseHeader(hline)
ident,tags,definition = parseHeader(hline, nastring=nastring)
sequence = line[0:-1]
next(i)
quality = next(i)[0:-1]
@ -106,7 +110,8 @@ def fastqWithoutQualityIterator(lineiterator,
int skip=0,
only=None,
firstline=None,
int buffersize=100000000
int buffersize=100000000,
bytes nastring=b"NA"
):
cdef bytes ident
cdef bytes definition
@ -154,7 +159,7 @@ def fastqWithoutQualityIterator(lineiterator,
if ionly >= 0 and read >= ionly:
break
ident,tags,definition = parseHeader(hline)
ident,tags,definition = parseHeader(hline, nastring=nastring)
sequence = line[0:-1]
next(i)
next(i)

View File

@ -1,4 +1,4 @@
#cython: language_level=3
cpdef tuple parseHeader(bytes header)
cpdef tuple parseHeader(bytes header, bytes nastring=*)

View File

@ -13,7 +13,7 @@ import re
__ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
cpdef tuple parseHeader(bytes header):
cpdef tuple parseHeader(bytes header, bytes nastring=b"NA"):
cdef list m
cdef dict tags
cdef bytes definition
@ -34,7 +34,7 @@ cpdef tuple parseHeader(bytes header):
m = __ret__.findall(second)
if m:
tags = dict([(a[1],__etag__(a[2])) for a in m])
tags = dict([(a[1],__etag__(a[2], nastring=nastring)) for a in m])
definition = second.split(m[-1][0],1)[1].strip()
else:
tags = {}

View File

@ -16,6 +16,7 @@ def ngsfilterIterator(lineiterator,
bint stripwhite=True,
bint blanklineskip=True,
bytes commentchar=b"#",
bytes nastring=b"NA",
int skip=0,
only=None,
firstline=None,
@ -69,6 +70,7 @@ def ngsfilterIterator(lineiterator,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
nastring = nastring,
skip = skip,
only = only,
firstline = None)

View File

@ -17,6 +17,7 @@ def tabIterator(lineiterator,
bint stripwhite=True,
bint blanklineskip=True,
bytes commentchar=b"#",
bytes nastring=b"NA",
int skip=0,
only=None,
firstline=None,
@ -89,11 +90,10 @@ def tabIterator(lineiterator,
data = [x.strip() for x in data]
for i in range(len(data)):
if key_types:
type_func = key_types[i]
if key_types: # TODO handle None when key types are actually read
view_line[keys[i]] = key_types[i](data[i])
else:
type_func = __etag__
view_line[keys[i]] = type_func(data[i])
view_line[keys[i]] = __etag__(data[i], nastring=nastring)
yield view_line

View File

@ -81,7 +81,8 @@ def entryIteratorFactory(lineiterator,
return (fastaNucIterator(lineiterator,
skip=skip,only=only,
firstline=first,
buffersize=buffersize),
buffersize=buffersize,
nastring=nastring),
Nuc_Seq)
else:
raise NotImplementedError()
@ -91,7 +92,8 @@ def entryIteratorFactory(lineiterator,
offset=offset,
noquality=noquality,
firstline=first,
buffersize=buffersize),
buffersize=buffersize,
nastring=nastring),
Nuc_Seq)
elif format==b'tabular':
return (tabIterator(lineiterator,
@ -101,6 +103,7 @@ def entryIteratorFactory(lineiterator,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
nastring=nastring,
skip = skip,
only = only,
firstline=first,
@ -113,6 +116,7 @@ def entryIteratorFactory(lineiterator,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
nastring=nastring,
skip = skip,
only = only,
firstline=first,