From 6825fc13ab77d47834c34400526db716636076eb Mon Sep 17 00:00:00 2001
From: Celine Mercier
Date: Wed, 21 Mar 2018 16:41:25 +0100
Subject: [PATCH] Cython API: added ngsfilter file parser

---
 python/obitools3/parsers/ngsfilter.pxd |   8 ++
 python/obitools3/parsers/ngsfilter.pyx | 131 ++++++++++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 python/obitools3/parsers/ngsfilter.pxd
 create mode 100644 python/obitools3/parsers/ngsfilter.pyx

diff --git a/python/obitools3/parsers/ngsfilter.pxd b/python/obitools3/parsers/ngsfilter.pxd
new file mode 100644
index 0000000..4422655
--- /dev/null
+++ b/python/obitools3/parsers/ngsfilter.pxd
@@ -0,0 +1,8 @@
+#cython: language_level=3
+
+from ..utils cimport str2bytes
+from ..files.universalopener cimport uopen
+from ..files.linebuffer cimport LineBuffer
+
+
+    
\ No newline at end of file
diff --git a/python/obitools3/parsers/ngsfilter.pyx b/python/obitools3/parsers/ngsfilter.pyx
new file mode 100644
index 0000000..bdaea16
--- /dev/null
+++ b/python/obitools3/parsers/ngsfilter.pyx
@@ -0,0 +1,131 @@
+#cython: language_level=3
+
+'''
+Created on march 8th 2018
+
+@author: cmercier
+'''
+
+from .tab import tabIterator
+from obitools3.utils cimport bytes2str
+import types
+
+
+def ngsfilterIterator(lineiterator,
+                      bytes sep = None,
+                      bytes dec = b".",
+                      bint stripwhite=True,
+                      bint blanklineskip=True,
+                      bytes commentchar=b"#",
+                      int skip=0,
+                      only=None,
+                      firstline=None,
+                      int buffersize=100000000
+                      ):
+    '''
+    Read an ngsfilter file and return a tabIterator over its rewritten lines.
+
+    The third column of a data line holds the tags, written either as
+    "forward:reverse" or as a single tag used for both directions. Each
+    data line is rewritten as six tab-separated columns (experiment, sample,
+    forward_tag, reverse_tag, forward_primer, reverse_primer), further
+    columns are dropped, and a header line naming the six columns comes first.
+
+    Blank lines and lines starting with commentchar are forwarded unchanged:
+    their handling is left to tabIterator, which receives blanklineskip and
+    commentchar.
+
+    @param lineiterator: a file name (str or bytes, opened with uopen), a
+                         LineBuffer, an object with a readlines attribute
+                         (wrapped in a LineBuffer) or any iterator; lines are
+                         handled with str methods, so str lines are expected
+    @param sep: input column separator, None to split on runs of whitespace
+    @param dec: passed unchanged to tabIterator
+    @param stripwhite: passed unchanged to tabIterator
+    @param blanklineskip: passed unchanged to tabIterator
+    @param commentchar: comment marker, also passed unchanged to tabIterator
+    @param skip: passed unchanged to tabIterator
+    @param only: passed unchanged to tabIterator
+    @param firstline: optional line handled before those of lineiterator
+    @param buffersize: size given to the LineBuffer wrapping lineiterator
+
+    @return: the value returned by tabIterator
+
+    @raise Exception: if lineiterator is of none of the supported kinds
+    @raise IndexError: if a data line has fewer than three columns
+    '''
+
+    cdef list all_lines
+    cdef str header
+    cdef str sep_str
+    cdef str comment_str
+    cdef bytes out_sep
+    cdef str out_sep_str
+
+    out_sep = b"\t"
+    out_sep_str = "\t"
+
+    if sep is not None:
+        sep_str = bytes2str(sep)
+    else:
+        sep_str = None
+
+    # An empty or missing comment marker disables comment detection
+    if commentchar:
+        comment_str = bytes2str(commentchar)
+    else:
+        comment_str = None
+
+    if isinstance(lineiterator, (str, bytes)):
+        lineiterator = uopen(lineiterator)
+
+    if isinstance(lineiterator, LineBuffer):
+        iterator = iter(lineiterator)
+    elif hasattr(lineiterator, "readlines"):
+        iterator = iter(LineBuffer(lineiterator, buffersize))
+    elif hasattr(lineiterator, '__next__'):
+        iterator = lineiterator
+    else:
+        raise Exception("Invalid line iterator")
+
+    all_lines = list(iterator)
+    if firstline is not None:
+        all_lines.insert(0, firstline)
+
+    # Header naming the columns produced below
+    column_names = ["experiment", "sample", "forward_tag", "reverse_tag", "forward_primer", "reverse_primer"]
+    header = out_sep_str.join(column_names)
+    new_lines = [header]
+
+    for line in all_lines:
+        stripped = line.strip()
+        if not stripped or (comment_str is not None
+                            and stripped.startswith(comment_str)):
+            # Not a data line: forward it as it is rather than failing on
+            # a tags column it may not have
+            new_lines.append(line)
+            continue
+
+        split_line = line.split(sep_str)
+        if len(split_line) < 3:
+            raise IndexError("ngsfilter line with fewer than 3 columns: %r"
+                             % (line,))
+
+        tags = split_line[2].split(":")
+        if len(tags) == 1:    # Forward and reverse tags are the same
+            tags.append(tags[0])
+        # Replace the tags column by the forward and reverse tag columns
+        split_line[2:3] = tags[0:2]
+        new_lines.append(out_sep_str.join(split_line[0:6]))
+
+    return tabIterator(iter(new_lines),
+                       header = True,
+                       sep = out_sep,
+                       dec = dec,
+                       stripwhite = stripwhite,
+                       blanklineskip = blanklineskip,
+                       commentchar = commentchar,
+                       skip = skip,
+                       only = only,
+                       firstline = None)
+