Cython: added full handling of NA strings when importing files

This commit is contained in:
Celine Mercier
2018-10-17 16:41:15 +02:00
parent da76f911db
commit da0e3d4043
11 changed files with 43 additions and 32 deletions

View File

@ -83,7 +83,6 @@ def run(config):
cdef list old_elements_names
cdef list new_elements_names
cdef ProgressBar pb
cdef bytes NA_value
global obi_errno
DMS.obi_atexit()
@ -122,9 +121,7 @@ def run(config):
pb = ProgressBar(10000000, config, seconde=5) # TODO should be number of records in file
entries = input[1]
NA_value = tobytes(config['obi']['inputnastring'])
NUC_SEQS_view = False
if isinstance(output[1], View) :
view = output[1]
@ -169,11 +166,7 @@ def run(config):
value = entry[tag]
if tag == b"taxid":
tag = b"TAXID"
# Check NA value
if value == NA_value :
value = None
if tag not in dcols :
value_type = type(value)

View File

@ -93,7 +93,8 @@ def fastaNucIterator(lineiterator,
int skip=0,
only=None,
firstline=None,
int buffersize=100000000
int buffersize=100000000,
bytes nastring=b"NA"
):
cdef bytes ident
@ -143,7 +144,7 @@ def fastaNucIterator(lineiterator,
pass
skipped += 1
ident,tags,definition = parseHeader(line)
ident,tags,definition = parseHeader(line, nastring=nastring)
s = []
line = next(iterator)

View File

@ -15,19 +15,22 @@ def fastqIterator(lineiterator,
int offset=-1,
bint noquality=False,
firstline=None,
int buffersize=100000000
int buffersize=100000000,
bytes nastring=b"NA"
):
if noquality:
return fastqWithoutQualityIterator(lineiterator,
skip,only,
firstline,
buffersize)
buffersize,
nastring)
else:
return fastqWithQualityIterator(lineiterator,
skip,only,
offset,
firstline,
buffersize)
buffersize,
nastring)
def fastqWithQualityIterator(lineiterator,
@ -35,7 +38,8 @@ def fastqWithQualityIterator(lineiterator,
only=None,
int offset=-1,
firstline=None,
int buffersize=100000000
int buffersize=100000000,
bytes nastring=b"NA"
):
cdef LineBuffer lb
@ -84,7 +88,7 @@ def fastqWithQualityIterator(lineiterator,
if ionly >= 0 and read >= ionly:
break
ident,tags,definition = parseHeader(hline)
ident,tags,definition = parseHeader(hline, nastring=nastring)
sequence = line[0:-1]
next(i)
quality = next(i)[0:-1]
@ -106,7 +110,8 @@ def fastqWithoutQualityIterator(lineiterator,
int skip=0,
only=None,
firstline=None,
int buffersize=100000000
int buffersize=100000000,
bytes nastring=b"NA"
):
cdef bytes ident
cdef bytes definition
@ -154,7 +159,7 @@ def fastqWithoutQualityIterator(lineiterator,
if ionly >= 0 and read >= ionly:
break
ident,tags,definition = parseHeader(hline)
ident,tags,definition = parseHeader(hline, nastring=nastring)
sequence = line[0:-1]
next(i)
next(i)

View File

@ -1,4 +1,4 @@
#cython: language_level=3
cpdef tuple parseHeader(bytes header)
cpdef tuple parseHeader(bytes header, bytes nastring=*)

View File

@ -13,7 +13,7 @@ import re
__ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
cpdef tuple parseHeader(bytes header):
cpdef tuple parseHeader(bytes header, bytes nastring=b"NA"):
cdef list m
cdef dict tags
cdef bytes definition
@ -34,7 +34,7 @@ cpdef tuple parseHeader(bytes header):
m = __ret__.findall(second)
if m:
tags = dict([(a[1],__etag__(a[2])) for a in m])
tags = dict([(a[1],__etag__(a[2], nastring=nastring)) for a in m])
definition = second.split(m[-1][0],1)[1].strip()
else:
tags = {}

View File

@ -16,6 +16,7 @@ def ngsfilterIterator(lineiterator,
bint stripwhite=True,
bint blanklineskip=True,
bytes commentchar=b"#",
bytes nastring=b"NA",
int skip=0,
only=None,
firstline=None,
@ -69,6 +70,7 @@ def ngsfilterIterator(lineiterator,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
nastring = nastring,
skip = skip,
only = only,
firstline = None)

View File

@ -17,6 +17,7 @@ def tabIterator(lineiterator,
bint stripwhite=True,
bint blanklineskip=True,
bytes commentchar=b"#",
bytes nastring=b"NA",
int skip=0,
only=None,
firstline=None,
@ -89,11 +90,10 @@ def tabIterator(lineiterator,
data = [x.strip() for x in data]
for i in range(len(data)):
if key_types:
type_func = key_types[i]
if key_types: # TODO handle None when key types are actually read
view_line[keys[i]] = key_types[i](data[i])
else:
type_func = __etag__
view_line[keys[i]] = type_func(data[i])
view_line[keys[i]] = __etag__(data[i], nastring=nastring)
yield view_line

View File

@ -81,7 +81,8 @@ def entryIteratorFactory(lineiterator,
return (fastaNucIterator(lineiterator,
skip=skip,only=only,
firstline=first,
buffersize=buffersize),
buffersize=buffersize,
nastring=nastring),
Nuc_Seq)
else:
raise NotImplementedError()
@ -91,7 +92,8 @@ def entryIteratorFactory(lineiterator,
offset=offset,
noquality=noquality,
firstline=first,
buffersize=buffersize),
buffersize=buffersize,
nastring=nastring),
Nuc_Seq)
elif format==b'tabular':
return (tabIterator(lineiterator,
@ -101,6 +103,7 @@ def entryIteratorFactory(lineiterator,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
nastring=nastring,
skip = skip,
only = only,
firstline=first,
@ -113,6 +116,7 @@ def entryIteratorFactory(lineiterator,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
nastring=nastring,
skip = skip,
only = only,
firstline=first,

View File

@ -425,7 +425,8 @@ def open_uri(uri,
if input:
iseq = fastaNucIterator(file,
skip=skip,
only=only)
only=only,
nastring=nastring)
else:
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
file,
@ -437,7 +438,8 @@ def open_uri(uri,
skip=skip,
only=only,
offset=offset,
noquality=noquality)
noquality=noquality,
nastring=nastring)
else:
iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring),
file,
@ -464,6 +466,7 @@ def open_uri(uri,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
nastring=nastring,
skip = skip,
only = only)
else:
@ -477,6 +480,7 @@ def open_uri(uri,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
nastring=nastring,
skip = skip,
only = only)
else:

View File

@ -18,4 +18,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value)
cdef obitype_t get_obitype_iterable_value(object value)
cdef obitype_t get_obitype(object value)
cdef object __etag__(bytes x)
cdef object __etag__(bytes x, bytes nastring=*)

View File

@ -247,11 +247,13 @@ __re_dict__ = re.compile(b"""^\{\ *
__re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
cdef object __etag__(bytes x):
cdef object __etag__(bytes x, bytes nastring=b"NA"):
cdef list elements
cdef tuple i
if __re_int__.match(x):
if x == nastring:
v = None
elif __re_int__.match(x):
v=int(x)
elif __re_float__.match(x):
v=float(x)