Cython API: added tabular file parser

2018-03-21 16:41:09 +01:00
parent 2684535e26
commit 49c17ab7b4
2 changed files with 122 additions and 0 deletions
--- a/python/obitools3/parsers/tab.pxd
+++ b/python/obitools3/parsers/tab.pxd
@ -0,0 +1,8 @@
+#cython: language_level=3
+
+from ..utils cimport str2bytes
+from ..files.universalopener cimport uopen
+from ..files.linebuffer cimport LineBuffer
+
+
+    
--- a/python/obitools3/parsers/tab.pyx
+++ b/python/obitools3/parsers/tab.pyx
@ -0,0 +1,114 @@
+#cython: language_level=3
+
+'''
+Created on feb 20th 2018
+
+@author: cmercier
+'''
+
+import types
+from obitools3.utils cimport bytes2str, tobytes
+from obitools3.utils cimport __etag__
+
+
+def tabIterator(lineiterator,
+                bint header = False,
+                bytes sep = None,
+                bytes dec = b".",          # TODO don't know how to use this to parse
+                bint stripwhite=True,
+                bint blanklineskip=True,
+                bytes commentchar=b"#",
+                int skip=0,
+                only=None,
+                firstline=None,
+                int buffersize=100000000
+                ):
+    '''
+    Generator parsing a tabular text source and yielding one dict per row.
+
+    Each yielded dict maps a column key to the field value after it has
+    been passed through `__etag__` (imported from obitools3.utils).
+
+    @param lineiterator: a file name (str or bytes, opened with `uopen`),
+                         a `LineBuffer`, an object with a `readlines`
+                         attribute (wrapped in a `LineBuffer`), or any
+                         iterator providing `__next__`. Lines are handled
+                         as `str` (they are split on the decoded separator).
+    @param header: if True, the first usable line gives the column keys
+                   (stripped, converted with `tobytes`); otherwise keys are
+                   the column indices as `str` ("0", "1", ...).
+    @param sep: field separator; None means `str.split()` default
+                (any run of whitespace).
+    @param dec: decimal separator. Currently unused.
+    @param stripwhite: if True, strip whitespace around every field.
+    @param blanklineskip: if True, ignore whitespace-only lines.
+    @param commentchar: lines starting with this prefix are ignored.
+    @param skip: number of lines to discard once the keys are known
+                 (the line current at that point is the first discarded).
+    @param only: maximum number of rows to yield; None means no limit.
+    @param firstline: a line already read from the source, to be used
+                      before pulling anything from the iterator.
+    @param buffersize: buffer size given to `LineBuffer` when one has to
+                       be created.
+
+    @raise Exception: if `lineiterator` is none of the supported kinds.
+    @raise IndexError: if a row holds more fields than there are keys.
+    '''
+
+    cdef int        ionly, read
+    cdef list       data
+    cdef dict       view_line
+    cdef str        sep_str    # TODO can't we read file lines as bytes?
+    cdef list       keys
+    cdef list       key_types
+
+    if sep is not None:
+        sep_str = bytes2str(sep)
+    else:
+        sep_str = None
+
+    commentchar_str = bytes2str(commentchar)
+
+    keys = []
+    key_types = []     # Never filled for now: every field goes through __etag__
+    skipped = 0
+    read = 0
+
+    if only is None:
+        ionly = -1
+    else:
+        ionly = int(only)
+
+    if isinstance(lineiterator, (str, bytes)):
+        lineiterator=uopen(lineiterator)
+    if isinstance(lineiterator, LineBuffer):
+        iterator = iter(lineiterator)
+    else:
+        if hasattr(lineiterator, "readlines"):
+            iterator = iter(LineBuffer(lineiterator, buffersize))
+        elif hasattr(lineiterator, '__next__'):
+            iterator = lineiterator
+        else:
+            raise Exception("Invalid line iterator")
+
+    # next() is always called with a None default: a bare next() raising
+    # StopIteration inside a generator is turned into a RuntimeError under
+    # PEP 479 semantics instead of ending the iteration cleanly.
+    if firstline is None:
+        line = next(iterator, None)
+    else:
+        line = firstline
+
+    while line is not None:
+
+        if ionly >= 0 and read >= ionly:
+            break
+
+        # Drop *every* blank or comment line, not only the first one of a
+        # run. startswith() is safe on an empty line (line[0] was not), and
+        # an empty comment prefix must not match all lines.
+        if (blanklineskip and not line.strip()) or \
+           (commentchar_str and line.startswith(commentchar_str)):
+            line = next(iterator, None)
+            continue
+
+        if not keys:
+            if header:
+                # TODO read types eventually
+                keys = [tobytes(x.strip()) for x in line.split(sep_str)]
+                line = next(iterator, None)
+                continue
+            else:
+                # TODO ??? default column names? like R?
+                # NOTE: these keys are str whereas header keys are bytes;
+                # kept as is so existing callers see the same keys.
+                keys = [str(i) for i in range(len(line.split(sep_str)))]
+
+        if skipped < skip:
+            while skipped < skip and line is not None:
+                line = next(iterator, None)
+                skipped += 1
+            # Re-run the checks above on the first line kept after skipping
+            continue
+
+        view_line = {}
+
+        # Parse
+        data = line.split(sep_str)
+
+        if stripwhite or key_types:
+            data = [x.strip() for x in data]
+
+        for i, field in enumerate(data):
+            if key_types:
+                type_func = key_types[i]
+            else:
+                type_func = __etag__
+            view_line[keys[i]] = type_func(field)
+
+        yield view_line
+
+        read+=1
+
+        line = next(iterator, None)
+    
+        
+