Cython: added full handling of NA strings when importing files

This commit is contained in:
Celine Mercier
2018-10-17 16:41:15 +02:00
parent da76f911db
commit da0e3d4043
11 changed files with 43 additions and 32 deletions

View File

@ -83,7 +83,6 @@ def run(config):
cdef list old_elements_names cdef list old_elements_names
cdef list new_elements_names cdef list new_elements_names
cdef ProgressBar pb cdef ProgressBar pb
cdef bytes NA_value
global obi_errno global obi_errno
DMS.obi_atexit() DMS.obi_atexit()
@ -122,9 +121,7 @@ def run(config):
pb = ProgressBar(10000000, config, seconde=5) # TODO should be number of records in file pb = ProgressBar(10000000, config, seconde=5) # TODO should be number of records in file
entries = input[1] entries = input[1]
NA_value = tobytes(config['obi']['inputnastring'])
NUC_SEQS_view = False NUC_SEQS_view = False
if isinstance(output[1], View) : if isinstance(output[1], View) :
view = output[1] view = output[1]
@ -169,11 +166,7 @@ def run(config):
value = entry[tag] value = entry[tag]
if tag == b"taxid": if tag == b"taxid":
tag = b"TAXID" tag = b"TAXID"
# Check NA value
if value == NA_value :
value = None
if tag not in dcols : if tag not in dcols :
value_type = type(value) value_type = type(value)

View File

@ -93,7 +93,8 @@ def fastaNucIterator(lineiterator,
int skip=0, int skip=0,
only=None, only=None,
firstline=None, firstline=None,
int buffersize=100000000 int buffersize=100000000,
bytes nastring=b"NA"
): ):
cdef bytes ident cdef bytes ident
@ -143,7 +144,7 @@ def fastaNucIterator(lineiterator,
pass pass
skipped += 1 skipped += 1
ident,tags,definition = parseHeader(line) ident,tags,definition = parseHeader(line, nastring=nastring)
s = [] s = []
line = next(iterator) line = next(iterator)

View File

@ -15,19 +15,22 @@ def fastqIterator(lineiterator,
int offset=-1, int offset=-1,
bint noquality=False, bint noquality=False,
firstline=None, firstline=None,
int buffersize=100000000 int buffersize=100000000,
bytes nastring=b"NA"
): ):
if noquality: if noquality:
return fastqWithoutQualityIterator(lineiterator, return fastqWithoutQualityIterator(lineiterator,
skip,only, skip,only,
firstline, firstline,
buffersize) buffersize,
nastring)
else: else:
return fastqWithQualityIterator(lineiterator, return fastqWithQualityIterator(lineiterator,
skip,only, skip,only,
offset, offset,
firstline, firstline,
buffersize) buffersize,
nastring)
def fastqWithQualityIterator(lineiterator, def fastqWithQualityIterator(lineiterator,
@ -35,7 +38,8 @@ def fastqWithQualityIterator(lineiterator,
only=None, only=None,
int offset=-1, int offset=-1,
firstline=None, firstline=None,
int buffersize=100000000 int buffersize=100000000,
bytes nastring=b"NA"
): ):
cdef LineBuffer lb cdef LineBuffer lb
@ -84,7 +88,7 @@ def fastqWithQualityIterator(lineiterator,
if ionly >= 0 and read >= ionly: if ionly >= 0 and read >= ionly:
break break
ident,tags,definition = parseHeader(hline) ident,tags,definition = parseHeader(hline, nastring=nastring)
sequence = line[0:-1] sequence = line[0:-1]
next(i) next(i)
quality = next(i)[0:-1] quality = next(i)[0:-1]
@ -106,7 +110,8 @@ def fastqWithoutQualityIterator(lineiterator,
int skip=0, int skip=0,
only=None, only=None,
firstline=None, firstline=None,
int buffersize=100000000 int buffersize=100000000,
bytes nastring=b"NA"
): ):
cdef bytes ident cdef bytes ident
cdef bytes definition cdef bytes definition
@ -154,7 +159,7 @@ def fastqWithoutQualityIterator(lineiterator,
if ionly >= 0 and read >= ionly: if ionly >= 0 and read >= ionly:
break break
ident,tags,definition = parseHeader(hline) ident,tags,definition = parseHeader(hline, nastring=nastring)
sequence = line[0:-1] sequence = line[0:-1]
next(i) next(i)
next(i) next(i)

View File

@ -1,4 +1,4 @@
#cython: language_level=3 #cython: language_level=3
cpdef tuple parseHeader(bytes header) cpdef tuple parseHeader(bytes header, bytes nastring=*)

View File

@ -13,7 +13,7 @@ import re
__ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''') __ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
cpdef tuple parseHeader(bytes header): cpdef tuple parseHeader(bytes header, bytes nastring=b"NA"):
cdef list m cdef list m
cdef dict tags cdef dict tags
cdef bytes definition cdef bytes definition
@ -34,7 +34,7 @@ cpdef tuple parseHeader(bytes header):
m = __ret__.findall(second) m = __ret__.findall(second)
if m: if m:
tags = dict([(a[1],__etag__(a[2])) for a in m]) tags = dict([(a[1],__etag__(a[2], nastring=nastring)) for a in m])
definition = second.split(m[-1][0],1)[1].strip() definition = second.split(m[-1][0],1)[1].strip()
else: else:
tags = {} tags = {}

View File

@ -16,6 +16,7 @@ def ngsfilterIterator(lineiterator,
bint stripwhite=True, bint stripwhite=True,
bint blanklineskip=True, bint blanklineskip=True,
bytes commentchar=b"#", bytes commentchar=b"#",
bytes nastring=b"NA",
int skip=0, int skip=0,
only=None, only=None,
firstline=None, firstline=None,
@ -69,6 +70,7 @@ def ngsfilterIterator(lineiterator,
stripwhite = stripwhite, stripwhite = stripwhite,
blanklineskip = blanklineskip, blanklineskip = blanklineskip,
commentchar = commentchar, commentchar = commentchar,
nastring = nastring,
skip = skip, skip = skip,
only = only, only = only,
firstline = None) firstline = None)

View File

@ -17,6 +17,7 @@ def tabIterator(lineiterator,
bint stripwhite=True, bint stripwhite=True,
bint blanklineskip=True, bint blanklineskip=True,
bytes commentchar=b"#", bytes commentchar=b"#",
bytes nastring=b"NA",
int skip=0, int skip=0,
only=None, only=None,
firstline=None, firstline=None,
@ -89,11 +90,10 @@ def tabIterator(lineiterator,
data = [x.strip() for x in data] data = [x.strip() for x in data]
for i in range(len(data)): for i in range(len(data)):
if key_types: if key_types: # TODO handle None when key types are actually read
type_func = key_types[i] view_line[keys[i]] = key_types[i](data[i])
else: else:
type_func = __etag__ view_line[keys[i]] = __etag__(data[i], nastring=nastring)
view_line[keys[i]] = type_func(data[i])
yield view_line yield view_line

View File

@ -81,7 +81,8 @@ def entryIteratorFactory(lineiterator,
return (fastaNucIterator(lineiterator, return (fastaNucIterator(lineiterator,
skip=skip,only=only, skip=skip,only=only,
firstline=first, firstline=first,
buffersize=buffersize), buffersize=buffersize,
nastring=nastring),
Nuc_Seq) Nuc_Seq)
else: else:
raise NotImplementedError() raise NotImplementedError()
@ -91,7 +92,8 @@ def entryIteratorFactory(lineiterator,
offset=offset, offset=offset,
noquality=noquality, noquality=noquality,
firstline=first, firstline=first,
buffersize=buffersize), buffersize=buffersize,
nastring=nastring),
Nuc_Seq) Nuc_Seq)
elif format==b'tabular': elif format==b'tabular':
return (tabIterator(lineiterator, return (tabIterator(lineiterator,
@ -101,6 +103,7 @@ def entryIteratorFactory(lineiterator,
stripwhite = stripwhite, stripwhite = stripwhite,
blanklineskip = blanklineskip, blanklineskip = blanklineskip,
commentchar = commentchar, commentchar = commentchar,
nastring=nastring,
skip = skip, skip = skip,
only = only, only = only,
firstline=first, firstline=first,
@ -113,6 +116,7 @@ def entryIteratorFactory(lineiterator,
stripwhite = stripwhite, stripwhite = stripwhite,
blanklineskip = blanklineskip, blanklineskip = blanklineskip,
commentchar = commentchar, commentchar = commentchar,
nastring=nastring,
skip = skip, skip = skip,
only = only, only = only,
firstline=first, firstline=first,

View File

@ -425,7 +425,8 @@ def open_uri(uri,
if input: if input:
iseq = fastaNucIterator(file, iseq = fastaNucIterator(file,
skip=skip, skip=skip,
only=only) only=only,
nastring=nastring)
else: else:
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
file, file,
@ -437,7 +438,8 @@ def open_uri(uri,
skip=skip, skip=skip,
only=only, only=only,
offset=offset, offset=offset,
noquality=noquality) noquality=noquality,
nastring=nastring)
else: else:
iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring),
file, file,
@ -464,6 +466,7 @@ def open_uri(uri,
stripwhite = stripwhite, stripwhite = stripwhite,
blanklineskip = blanklineskip, blanklineskip = blanklineskip,
commentchar = commentchar, commentchar = commentchar,
nastring=nastring,
skip = skip, skip = skip,
only = only) only = only)
else: else:
@ -477,6 +480,7 @@ def open_uri(uri,
stripwhite = stripwhite, stripwhite = stripwhite,
blanklineskip = blanklineskip, blanklineskip = blanklineskip,
commentchar = commentchar, commentchar = commentchar,
nastring=nastring,
skip = skip, skip = skip,
only = only) only = only)
else: else:

View File

@ -18,4 +18,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value)
cdef obitype_t get_obitype_iterable_value(object value) cdef obitype_t get_obitype_iterable_value(object value)
cdef obitype_t get_obitype(object value) cdef obitype_t get_obitype(object value)
cdef object __etag__(bytes x) cdef object __etag__(bytes x, bytes nastring=*)

View File

@ -247,11 +247,13 @@ __re_dict__ = re.compile(b"""^\{\ *
__re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""") __re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
cdef object __etag__(bytes x): cdef object __etag__(bytes x, bytes nastring=b"NA"):
cdef list elements cdef list elements
cdef tuple i cdef tuple i
if __re_int__.match(x): if x == nastring:
v = None
elif __re_int__.match(x):
v=int(x) v=int(x)
elif __re_float__.match(x): elif __re_float__.match(x):
v=float(x) v=float(x)