#cython: language_level=3

'''
FASTQ record iterator.

Created on 30 March 2016

@author: coissac
'''

# Same cimports as the companion fastq.pxd declares for this module.
from .header cimport parseHeader
from ..files.universalopener cimport uopen
from ..files.linebuffer cimport LineBuffer


def fastqIterator(lineiterator, int buffersize=100000000):
    '''
    Iterate over the records of a FASTQ stream, one dict per record.

    Each record is read as exactly four consecutive lines: a header line,
    one sequence line, one separator line and one quality line. The
    separator line is consumed but neither checked nor kept, and sequences
    or qualities wrapped over several lines are not supported.

    @param lineiterator: a `str` or `bytes` value, which is handed to
                         `uopen`; or a `LineBuffer`, which is used as is;
                         or any other object, which is wrapped in a
                         `LineBuffer`.
    @param buffersize:   second argument given to `LineBuffer` when one has
                         to be built here; unused otherwise.

    @return: a generator of dicts with the keys "id", "definition" and
             "tags" (the three values returned by `parseHeader` for the
             header line), "sequence" and "quality" (the corresponding
             lines without their line terminator) and "annotation"
             (a new empty dict for every record).

    @raise ValueError: when the input ends in the middle of a record.
    '''
    cdef LineBuffer lb
    cdef str ident
    cdef str definition
    cdef dict tags

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    lines = iter(lb)
    for line in lines:
        ident, tags, definition = parseHeader(line)

        try:
            # Remove only the line terminator. Cutting the last character
            # unconditionally would eat a real symbol when the final line
            # of the input has no trailing newline, and would leave a
            # stray '\r' on CRLF input.
            sequence = next(lines).rstrip('\r\n')
            next(lines)     # separator line, ignored
            quality = next(lines).rstrip('\r\n')
        except StopIteration:
            # A StopIteration escaping from a generator body either ends
            # the iteration silently or is turned into an opaque
            # RuntimeError (PEP 479); report the real problem instead.
            raise ValueError(
                "Truncated FASTQ record for sequence '%s'" % ident
            ) from None

        yield {"id": ident,
               "definition": definition,
               "sequence": sequence,
               "quality": quality,
               "tags": tags,
               "annotation": {}
               }
#cython: language_level=3

'''
Parsing of sequence header lines carrying `key=value;` annotations.

Created on 25 March 2016

@author: coissac
'''

import ast
import re

# One `key=value;` annotation followed by optional spaces.
# Group 1 is the whole annotation (trailing spaces included), group 2 the
# key and group 3 the raw value: single-quoted, double-quoted, or any run
# of characters up to the next ';'.
__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')


cdef object __etag__(str x):
    '''
    Convert the raw text of a tag value to a Python object.

    The text is parsed as a Python literal (number, string, tuple, list,
    dict, set, boolean or None). Anything that is not a valid literal is
    returned unchanged as a string.

    NOTE: this deliberately uses `ast.literal_eval` and not `eval`. Tag
    values come straight from sequence files; with `eval` a value such as
    `max` is turned into the builtin function, `10-20` into `-10`, and any
    expression found in a file gets executed. The consequence is that
    arithmetic expressions are no longer computed: they stay strings.

    @param x: the raw value text
    @return: the literal value, or `x` itself when it is not a literal
    '''
    try:
        # `eval` ignored surrounding blanks; strip them to keep accepting
        # values written as `key= 12;`.
        return ast.literal_eval(x.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return x


cpdef tuple parseHeader(str header):
    '''
    Split a sequence header line into identifier, tags and definition.

    The first character of the line (the record marker) and the line
    terminator are discarded. The first whitespace-delimited word is the
    identifier. In the remainder, every `key=value;` annotation becomes a
    tag whose value is converted by `__etag__`, and the text located after
    the last annotation is the definition. Text located before the first
    annotation is not kept. Without any annotation the whole remainder is
    the definition.

    @param header: the complete header line, marker included

    @return: the tuple `(ident, tags, definition)` where `ident` and
             `definition` are `str` and `tags` is a `dict`

    @raise IndexError: when nothing follows the marker character
    '''
    cdef list fields
    cdef list hits
    cdef dict tags
    cdef str definition
    cdef str ident
    cdef str second

    # Drop the marker, then only the line terminator: a header that is the
    # last line of an input without trailing newline must keep its final
    # character.
    fields = header[1:].rstrip('\r\n').split(maxsplit=1)

    ident = fields[0]

    if len(fields) == 1:
        return ident, {}, ''

    second = fields[1]
    hits = list(__ret__.finditer(second))

    if not hits:
        return ident, {}, second.strip()

    tags = {hit.group(2): __etag__(hit.group(3)) for hit in hits}

    # Cut at the end offset of the last annotation. Searching for its text
    # instead would stop at an earlier identical annotation and leave tags
    # inside the definition.
    definition = second[hits[-1].end():].strip()

    return ident, tags, definition