From 1cd35b33591c71340d5fc545a5c49e13ac7b5c4a Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Thu, 31 Mar 2016 10:47:12 +0200
Subject: [PATCH] first version of a fastq parser

---
 python/obitools3/parsers/__init__.py |  0
 python/obitools3/parsers/fastq.pxd   |  5 ++
 python/obitools3/parsers/fastq.pyx   | 64 +++++++++++++++++++++++
 python/obitools3/parsers/header.pxd  |  5 ++
 python/obitools3/parsers/header.pyx  | 76 ++++++++++++++++++++++++++++
 5 files changed, 150 insertions(+)
 create mode 100644 python/obitools3/parsers/__init__.py
 create mode 100644 python/obitools3/parsers/fastq.pxd
 create mode 100644 python/obitools3/parsers/fastq.pyx
 create mode 100644 python/obitools3/parsers/header.pxd
 create mode 100644 python/obitools3/parsers/header.pyx

diff --git a/python/obitools3/parsers/__init__.py b/python/obitools3/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/obitools3/parsers/fastq.pxd b/python/obitools3/parsers/fastq.pxd
new file mode 100644
--- /dev/null
+++ b/python/obitools3/parsers/fastq.pxd
@@ -0,0 +1,5 @@
+#cython: language_level=3
+
+from .header cimport parseHeader
+from ..files.universalopener cimport uopen
+from ..files.linebuffer cimport LineBuffer
diff --git a/python/obitools3/parsers/fastq.pyx b/python/obitools3/parsers/fastq.pyx
new file mode 100644
--- /dev/null
+++ b/python/obitools3/parsers/fastq.pyx
@@ -0,0 +1,64 @@
+#cython: language_level=3
+
+'''
+Created on 30 mars 2016
+
+@author: coissac
+'''
+
+
+def fastqIterator(lineiterator, int buffersize=100000000):
+    '''
+    Iterate over the records of a fastq file.
+
+    Each record is read as four consecutive lines: the header line, the
+    sequence line, a separator line whose content is ignored, and the
+    quality line.
+
+    @param lineiterator: a str or bytes value, which is handed to uopen,
+                         a LineBuffer instance, or any object that
+                         LineBuffer accepts as its source
+    @param buffersize: size given to the LineBuffer built around
+                       lineiterator when it is not already one
+
+    @return: a generator yielding one dict per record, with the keys
+             "id", "definition", "sequence", "quality", "tags" and
+             "annotation"
+
+    @raise ValueError: if the input stops in the middle of a record
+    '''
+    cdef LineBuffer lb
+    cdef str ident
+    cdef str definition
+    cdef dict tags
+
+    if isinstance(lineiterator, (str, bytes)):
+        lineiterator = uopen(lineiterator)
+
+    if isinstance(lineiterator, LineBuffer):
+        lb = lineiterator
+    else:
+        lb = LineBuffer(lineiterator, buffersize)
+
+    i = iter(lb)
+    for line in i:
+        ident, tags, definition = parseHeader(line)
+
+        try:
+            # Only the line terminator is removed, so the last record of
+            # a file that lacks a final newline keeps its last character.
+            sequence = next(i).rstrip('\r\n')
+            next(i)    # separator line, content ignored
+            quality = next(i).rstrip('\r\n')
+        except StopIteration:
+            # A StopIteration must not escape from a generator body
+            # (PEP 479): report the truncated record explicitly.
+            raise ValueError("truncated fastq record: %s" % ident) from None
+
+        yield {"id": ident,
+               "definition": definition,
+               "sequence": sequence,
+               "quality": quality,
+               "tags": tags,
+               "annotation": {}
+               }
diff --git a/python/obitools3/parsers/header.pxd b/python/obitools3/parsers/header.pxd
new file mode 100644
index 0000000..b09a418
--- /dev/null
+++ b/python/obitools3/parsers/header.pxd
@@ -0,0 +1,5 @@
+#cython: language_level=3
+
+cdef object __etag__(str x)
+
+cpdef tuple parseHeader(str header)
diff --git a/python/obitools3/parsers/header.pyx b/python/obitools3/parsers/header.pyx
new file mode 100644
--- /dev/null
+++ b/python/obitools3/parsers/header.pyx
@@ -0,0 +1,76 @@
+#cython: language_level=3
+
+'''
+Created on 25 mars 2016
+
+@author: coissac
+'''
+
+import re
+
+# One "key=value;" tag: the value is a quoted string or any run of
+# characters up to the next semicolon.
+__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
+
+
+cdef object __etag__(str x):
+    '''
+    Convert the textual value of a tag to a Python object.
+
+    The text is evaluated as a Python expression; when the evaluation
+    fails the text itself is returned unchanged.
+    '''
+    # NOTE(review): eval() runs arbitrary expressions, and the empty
+    # namespaces passed here do not hide the builtins. Headers coming
+    # from untrusted files should go through ast.literal_eval instead.
+    try:
+        v = eval(x, {}, {})
+    except Exception:
+        # Not an expression (plain word, free text...): keep the string.
+        # KeyboardInterrupt and SystemExit are no longer swallowed.
+        v = x
+    return v
+
+
+cpdef tuple parseHeader(str header):
+    '''
+    Split a header line into identifier, tags and definition.
+
+    The first character of the line (the record marker) and the line
+    terminator are dropped. The first word is the identifier; the
+    "key=value;" items found in the remaining text are the tags, and the
+    text following the last of them is the definition.
+
+    @param header: the raw header line
+
+    @return: the tuple (ident, tags, definition)
+    '''
+    cdef list m
+    cdef dict tags
+    cdef str definition
+    cdef str ident
+    cdef str second
+
+    # Strip the terminator rather than blindly the last character, so a
+    # header given without its newline is not truncated.
+    m = header[1:].rstrip('\r\n').split(maxsplit=1)
+
+    ident = m[0]
+
+    if len(m) == 1:
+        tags = {}
+        definition = ''
+    else:
+        second = m[1]
+        m = list(__ret__.finditer(second))
+
+        if m:
+            tags = {t.group(2): __etag__(t.group(3)) for t in m}
+            # Cut at the end offset of the last tag: splitting on its
+            # text would cut too early when the same tag occurs twice.
+            definition = second[m[-1].end():].strip()
+        else:
+            tags = {}
+            definition = second.strip()
+
+    return ident, tags, definition