Cython API: added tabular file parser

This commit is contained in:
Celine Mercier
2018-03-21 16:41:09 +01:00
parent 2684535e26
commit 49c17ab7b4
2 changed files with 122 additions and 0 deletions

View File

@ -0,0 +1,8 @@
#cython: language_level=3
from ..utils cimport str2bytes
from ..files.universalopener cimport uopen
from ..files.linebuffer cimport LineBuffer

View File

@ -0,0 +1,114 @@
#cython: language_level=3
'''
Created on feb 20th 2018
@author: cmercier
'''
import types
from obitools3.utils cimport bytes2str, tobytes
from obitools3.utils cimport __etag__
def tabIterator(lineiterator,
                bint header = False,
                bytes sep = None,
                bytes dec = b".",   # TODO don't know how to use this to parse
                bint stripwhite=True,
                bint blanklineskip=True,
                bytes commentchar=b"#",
                int skip=0,
                only=None,
                firstline=None,
                int buffersize=100000000
                ):
    '''
    Generator parsing a tabular text source, yielding one dict per data row.

    @param lineiterator: the source of lines. A str or bytes value is opened
                         with uopen(); a LineBuffer is iterated directly; an
                         object with a readlines attribute is wrapped in a
                         LineBuffer of size buffersize; any other object with
                         a __next__ attribute is used as is.
    @param header: if True, the first retained line gives the column names
                   (stripped and converted with tobytes()). Otherwise the
                   columns are named "0", "1", ... (str) after the number of
                   fields of the first retained line.
    @param sep: field separator, decoded with bytes2str() and passed to
                split(); None lets split() use its default whitespace rule.
    @param dec: currently unused by the parser (see TODO in the signature).
    @param stripwhite: if True, surrounding whitespace is stripped from each
                       field.
    @param blanklineskip: if True, lines that are empty once stripped are
                          ignored.
    @param commentchar: lines whose first character equals this character
                        are ignored.
    @param skip: number of lines discarded once the column names are known,
                 before the first row is parsed.
    @param only: maximum number of rows to yield (converted with int());
                 None means no limit.
    @param firstline: if not None, used as the first line instead of reading
                      it from the line source.
    @param buffersize: buffer size given to LineBuffer when the source has
                       to be wrapped.

    @return: yields dicts mapping each column name to its field value
             converted by __etag__().
    @raise Exception: if lineiterator is none of the supported kinds.
    '''

    cdef int ionly, read
    cdef list data
    cdef dict view_line
    cdef str sep_str        # TODO can't we read file lines as bytes?
    cdef list keys
    cdef list key_types

    if sep is not None:
        sep_str = bytes2str(sep)
    else:
        sep_str = None
    commentchar_str = bytes2str(commentchar)

    keys = []
    key_types = []
    skipped = 0
    read = 0

    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)
    if isinstance(lineiterator, LineBuffer):
        iterator = iter(lineiterator)
    else:
        if hasattr(lineiterator, "readlines"):
            iterator = iter(LineBuffer(lineiterator, buffersize))
        elif hasattr(lineiterator, '__next__'):
            iterator = lineiterator
        else:
            raise Exception("Invalid line iterator")

    # Every next() below signals the end of the input with StopIteration.
    # It is caught here so the generator finishes with a plain return: a
    # StopIteration escaping a generator body is turned into a RuntimeError
    # under PEP 479.
    try:
        if firstline is None:
            line = next(iterator)
        else:
            line = firstline

        while True:

            if ionly >= 0 and read >= ionly:
                break

            # Ignore blank and comment lines. The 'continue' re-tests the
            # replacement line, so runs of several such lines are all skipped.
            # 'line and' protects line[0] against an empty line when
            # blanklineskip is off.
            if (blanklineskip and not line.strip()) or (line and line[0] == commentchar_str):
                line = next(iterator)
                continue

            if not keys:
                if header:
                    # TODO read types eventually
                    keys = [tobytes(x.strip()) for x in line.split(sep_str)]
                    line = next(iterator)
                    continue
                else:
                    # TODO ??? default column names? like R?
                    keys = [str(i) for i in range(len(line.split(sep_str)))]

            if skipped < skip:
                while skipped < skip:
                    line = next(iterator)
                    skipped += 1
                # The line reached after skipping goes through the
                # blank/comment filter like any other line.
                continue

            view_line = {}

            # Parse
            data = line.split(sep_str)

            if stripwhite or key_types:
                data = [x.strip() for x in data]

            for i, field in enumerate(data):
                # key_types is never filled yet (see TODO above), so every
                # field currently goes through __etag__.
                if key_types:
                    type_func = key_types[i]
                else:
                    type_func = __etag__
                view_line[keys[i]] = type_func(field)

            yield view_line

            read += 1

            line = next(iterator)

    except StopIteration:
        return