Cython API: added tabular file parser
This commit is contained in:
8
python/obitools3/parsers/tab.pxd
Normal file
8
python/obitools3/parsers/tab.pxd
Normal file
@ -0,0 +1,8 @@
|
||||
#cython: language_level=3
|
||||
|
||||
from ..utils cimport str2bytes
|
||||
from ..files.universalopener cimport uopen
|
||||
from ..files.linebuffer cimport LineBuffer
|
||||
|
||||
|
||||
|
114
python/obitools3/parsers/tab.pyx
Normal file
114
python/obitools3/parsers/tab.pyx
Normal file
@ -0,0 +1,114 @@
|
||||
#cython: language_level=3
|
||||
|
||||
'''
|
||||
Created on feb 20th 2018
|
||||
|
||||
@author: cmercier
|
||||
'''
|
||||
|
||||
import types
|
||||
from obitools3.utils cimport bytes2str, tobytes
|
||||
from obitools3.utils cimport __etag__
|
||||
|
||||
|
||||
def tabIterator(lineiterator,
                bint header = False,
                bytes sep = None,
                bytes dec = b".",  # TODO don't know how to use this to parse
                bint stripwhite=True,
                bint blanklineskip=True,
                bytes commentchar=b"#",
                int skip=0,
                only=None,
                firstline=None,
                int buffersize=100000000
                ):
    '''
    Generator parsing the lines of a tabular text source into dictionaries.

    Each yielded dict maps a column key to the converted value of the
    corresponding field of one data line.

    @param lineiterator: a file name (str or bytes, opened with `uopen`), a
                         `LineBuffer`, an object with a `readlines` attribute
                         (wrapped in a `LineBuffer`), or any object with a
                         `__next__` method.
                         NOTE(review): the lines are assumed to be `str`
                         (the separator and comment marker are decoded to
                         `str` before use) -- confirm against `uopen` and
                         `LineBuffer`.
    @param header: if True, the first retained line provides the column keys
                   (stripped and converted with `tobytes`). Otherwise the keys
                   are the column indices as `str` ("0", "1", ...), counted on
                   the first retained line.
                   NOTE(review): the keys are therefore bytes with a header
                   and str without one.
    @param sep: field separator; None means `str.split()` default splitting
                on runs of whitespace.
    @param dec: decimal separator; currently unused.
    @param stripwhite: if True, surrounding whitespace is stripped from each
                       field.
    @param blanklineskip: if True, lines containing only whitespace are
                          ignored.
    @param commentchar: lines whose first character equals this marker are
                        ignored.
    @param skip: number of lines to discard once the column keys are known
                 (i.e. after the header line when `header` is True).
    @param only: maximum number of records to yield; None means no limit.
    @param firstline: if not None, used as the first line instead of reading
                      it from the iterator.
    @param buffersize: passed to `LineBuffer` when the source is wrapped.

    @raise Exception: if `lineiterator` is of none of the supported kinds.
    @raise IndexError: if a data line has more fields than there are keys.
    '''

    cdef int     ionly, read, skipped
    cdef list    data
    cdef dict    view_line
    cdef str     sep_str    # TODO can't we read file lines as bytes?
    cdef list    keys
    cdef list    key_types

    if sep is not None:
        sep_str = bytes2str(sep)
    else:
        sep_str = None

    commentchar_str = bytes2str(commentchar)

    keys = []
    key_types = []  # never filled yet, see the TODO on column types below
    skipped = 0
    read = 0

    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)
    if isinstance(lineiterator, LineBuffer):
        iterator = iter(lineiterator)
    elif hasattr(lineiterator, "readlines"):
        iterator = iter(LineBuffer(lineiterator, buffersize))
    elif hasattr(lineiterator, '__next__'):
        iterator = lineiterator
    else:
        raise Exception("Invalid line iterator")

    # Lines are fetched with a None default so that the end of the input
    # ends the generator cleanly instead of leaking StopIteration out of
    # the generator body (a RuntimeError under PEP 479 semantics).
    if firstline is None:
        line = next(iterator, None)
    else:
        line = firstline

    while line is not None:

        if ionly >= 0 and read >= ionly:
            break

        # Ignore blank and comment lines. The `continue` re-runs the test on
        # the following line so that runs of consecutive blank/comment lines
        # are skipped entirely. Testing `line` first avoids indexing an
        # empty string.
        if (blanklineskip and not line.strip()) or (line and line[0] == commentchar_str):
            line = next(iterator, None)
            continue

        if not keys:
            if header:
                # TODO read types eventually
                keys = [tobytes(x.strip()) for x in line.split(sep_str)]
                line = next(iterator, None)
                continue
            else:
                # TODO ??? default column names? like R?
                keys = [str(i) for i in range(len(line.split(sep_str)))]

        if skipped < skip:
            # Discard the requested number of lines, then loop again so that
            # the line reached is itself checked for blank/comment content.
            while skipped < skip and line is not None:
                line = next(iterator, None)
                skipped += 1
            continue

        view_line = {}

        # Parse
        data = line.split(sep_str)

        if stripwhite or key_types:
            data = [x.strip() for x in data]

        for i, value in enumerate(data):
            if key_types:
                type_func = key_types[i]
            else:
                type_func = __etag__
            view_line[keys[i]] = type_func(value)

        yield view_line

        read += 1

        line = next(iterator, None)
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user