#cython: language_level=3

'''
Created on feb 20th 2018

@author: cmercier
'''

import types
from obitools3.utils cimport bytes2str, tobytes
from obitools3.utils cimport __etag__

# cimports declared in the companion definition file
# (python/obitools3/parsers/tab.pxd)
from ..utils cimport str2bytes
from ..files.universalopener cimport uopen
from ..files.linebuffer cimport LineBuffer


def tabIterator(lineiterator,
                bint header = False,
                bytes sep = None,
                bytes dec = b".",    # TODO don't know how to use this to parse
                bint stripwhite=True,
                bint blanklineskip=True,
                bytes commentchar=b"#",
                int skip=0,
                only=None,
                firstline=None,
                int buffersize=100000000
                ):
    '''
    Iterate over the records of a tabular (character-separated) text source.

    Each data line is split on the separator and yielded as a dict mapping
    a column name to the value returned by ``__etag__`` for that field.
    Lines are handled as text: ``sep`` and ``commentchar`` are decoded with
    ``bytes2str`` before being applied to them.

    @param lineiterator: where the lines come from. A ``str``/``bytes`` value
                         is opened with ``uopen``; a ``LineBuffer`` is iterated
                         directly; an object with a ``readlines`` attribute is
                         wrapped in a ``LineBuffer``; any other object with a
                         ``__next__`` attribute is used as is.
    @param header: if true, the first retained line provides the column names
                   (stripped, converted with ``tobytes``). Otherwise columns
                   are named b"0", b"1", ... after the field count of the
                   first retained line.
    @param sep: field separator; ``None`` means ``str.split()`` default
                (runs of whitespace).
    @param dec: decimal separator, currently unused.
    @param stripwhite: strip surrounding whitespace from every field.
    @param blanklineskip: ignore lines that contain only whitespace.
    @param commentchar: lines starting with this marker are ignored
                        (``None`` or empty disables comment detection).
    @param skip: number of raw lines discarded once the column names are
                 known, counted from the line being examined at that point.
    @param only: maximum number of records to yield (``None``: no limit).
    @param firstline: a line already read by the caller, used as the first
                      line instead of reading one from the iterator.
    @param buffersize: buffer size given to ``LineBuffer`` when one has to
                       be created.

    @raise Exception: if ``lineiterator`` is of none of the supported kinds.
    @raise IndexError: if a data line has more fields than column names.
    '''

    cdef int   ionly, read, skipped
    cdef list  data
    cdef dict  view_line
    cdef str   sep_str     # TODO can't we read file lines as bytes?
    cdef list  keys
    cdef list  key_types

    if sep is not None:
        sep_str = bytes2str(sep)
    else:
        sep_str = None

    # A None marker simply disables comment detection instead of failing
    # in the conversion.
    if commentchar is not None:
        commentchar_str = bytes2str(commentchar)
    else:
        commentchar_str = None

    keys = []
    key_types = []     # stays empty for now: every field goes through __etag__
    skipped = 0
    read = 0

    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)
    if isinstance(lineiterator, LineBuffer):
        iterator = iter(lineiterator)
    elif hasattr(lineiterator, "readlines"):
        iterator = iter(LineBuffer(lineiterator, buffersize))
    elif hasattr(lineiterator, '__next__'):
        iterator = lineiterator
    else:
        raise Exception("Invalid line iterator")

    # next() is always called with a default: letting StopIteration escape
    # from a generator body is turned into a RuntimeError (PEP 479), so the
    # end of the input is signalled by None instead.
    if firstline is None:
        line = next(iterator, None)
    else:
        line = firstline

    while line is not None and (ionly < 0 or read < ionly):

        # Blank and comment lines: 'continue' re-tests the replacement line,
        # so any number of consecutive ones is skipped. startswith() is safe
        # on an empty line, unlike indexing its first character.
        if (blanklineskip and not line.strip()) or \
           (commentchar_str and line.startswith(commentchar_str)):
            line = next(iterator, None)
            continue

        if not keys:
            if header:
                # TODO read types eventually
                keys = [tobytes(x.strip()) for x in line.split(sep_str)]
                line = next(iterator, None)
                continue
            # TODO ??? default column names? like R?
            # Converted with tobytes so that keys have the same type
            # whether or not they come from a header line.
            keys = [tobytes(str(i)) for i in range(len(line.split(sep_str)))]

        if skipped < skip:
            # Raw lines are discarded here, whatever their content...
            while skipped < skip and line is not None:
                line = next(iterator, None)
                skipped += 1
            # ...then the line reached goes through the blank/comment
            # filter like any other before being parsed.
            continue

        # Parse
        data = line.split(sep_str)

        if stripwhite or key_types:
            data = [x.strip() for x in data]

        view_line = {}
        for i, field in enumerate(data):
            if key_types:
                type_func = key_types[i]
            else:
                type_func = __etag__
            view_line[keys[i]] = type_func(field)

        yield view_line

        read += 1

        line = next(iterator, None)