diff --git a/python/obitools3/files/linebuffer.pyx b/python/obitools3/files/linebuffer.pyx index b0dbaf1..3ac070a 100644 --- a/python/obitools3/files/linebuffer.pyx +++ b/python/obitools3/files/linebuffer.pyx @@ -8,13 +8,13 @@ Created on 30 mars 2016 cdef class LineBuffer: - def __init__(self,object fileobj,int size=100000000): + def __init__(self, object fileobj, int size=100000000): self.fileobj=fileobj self.size=size def __iter__(self): cdef list buff = self.fileobj.readlines(self.size) - cdef str l + cdef object l # Can be str or bytes while buff: for l in buff: diff --git a/python/obitools3/files/universalopener.pxd b/python/obitools3/files/universalopener.pxd index 7743db6..2c7373b 100644 --- a/python/obitools3/files/universalopener.pxd +++ b/python/obitools3/files/universalopener.pxd @@ -2,4 +2,4 @@ from .uncompress cimport CompressedFile -cpdef CompressedFile uopen(str name, mode=?) \ No newline at end of file +cpdef CompressedFile uopen(object name, mode=?) \ No newline at end of file diff --git a/python/obitools3/files/universalopener.pyx b/python/obitools3/files/universalopener.pyx index 7c4fdbe..9f0b085 100644 --- a/python/obitools3/files/universalopener.pyx +++ b/python/obitools3/files/universalopener.pyx @@ -7,15 +7,16 @@ Created on 25 mars 2016 ''' from urllib.request import urlopen +from obitools3.utils cimport tostr -cpdef CompressedFile uopen(str name, mode='r'): +cpdef CompressedFile uopen(object name, mode='rb'): cdef CompressedFile c try: - f = urlopen(name) + f = urlopen(tostr(name)) except: - f = open(name,mode) + f = open(tostr(name),mode) c = CompressedFile(f) diff --git a/python/obitools3/parsers/fasta.pyx b/python/obitools3/parsers/fasta.pyx index 0a7b6d1..fd169d9 100644 --- a/python/obitools3/parsers/fasta.pyx +++ b/python/obitools3/parsers/fasta.pyx @@ -11,89 +11,82 @@ import types from obitools3.dms.obiseq cimport Nuc_Seq -def fastaIterator(lineiterator, - int skip=0, - only=None, - firstline=None, - int buffersize=100000000 - ): - 
cdef str ident - cdef str definition - cdef dict tags - cdef list s - cdef bytes sequence - cdef int skipped, ionly, read -# cdef OBI_Seq seq - - if only is None: - ionly=-1 - else: - ionly=int(only) - - if isinstance(lineiterator, (str, bytes)): - lineiterator=uopen(lineiterator) - if isinstance(lineiterator, LineBuffer): - iterator = iter(lineiterator) - else: - if hasattr(lineiterator, "readlines"): - iterator = iter(LineBuffer(lineiterator, buffersize)) - elif hasattr(lineiterator, '__next__'): - iterator = lineiterator - else: - raise Exception("Invalid line iterator") - - skipped = 0 - i = iterator - - if firstline is None: - line = next(i) - else: - line = firstline - - while True: - - if ionly >= 0 and read >= ionly: - break - - while skipped < skip : - line = next(i) - try: - while line[0]!='>': - line = next(i) - except StopIteration: - pass - skipped += 1 - - ident,tags,definition = parseHeader(line) - s = [] - line = next(i) - - try: - while line[0]!='>': - s.append(str2bytes(line)[0:-1]) - line = next(i) - - except StopIteration: - pass - - sequence = b"".join(s) - - # seq = OBI_Seq(id, - # sequence, - # definition, - # tags=tags, - # ) - # TODO Seq object - yield { "id" : ident, - "definition" : definition, - "sequence" : sequence, - "quality" : None, - "offset" : None, - "tags" : tags, - "annotation" : {} - } - - read+=1 +# def fastaIterator(lineiterator, +# int skip=0, +# only=None, +# firstline=None, +# int buffersize=100000000 +# ): +# cdef str ident +# cdef str definition +# cdef dict tags +# cdef list s +# cdef bytes sequence +# cdef int skipped, ionly, read +# +# if only is None: +# ionly=-1 +# else: +# ionly=int(only) +# +# if isinstance(lineiterator, (str, bytes)): +# lineiterator=uopen(lineiterator) +# if isinstance(lineiterator, LineBuffer): +# iterator = iter(lineiterator) +# else: +# if hasattr(lineiterator, "readlines"): +# iterator = iter(LineBuffer(lineiterator, buffersize)) +# elif hasattr(lineiterator, '__next__'): +# iterator = 
lineiterator +# else: +# raise Exception("Invalid line iterator") +# +# skipped = 0 +# i = iterator +# +# if firstline is None: +# line = next(i) +# else: +# line = firstline +# +# while True: +# +# if ionly >= 0 and read >= ionly: +# break +# +# while skipped < skip : +# line = next(i) +# try: +# while line[0]!='>': +# line = next(i) +# except StopIteration: +# pass +# skipped += 1 +# +# ident,tags,definition = parseHeader(line) +# s = [] +# line = next(i) +# +# try: +# while line[0]!='>': +# s.append(str2bytes(line)[0:-1]) +# line = next(i) +# +# except StopIteration: +# pass +# +# sequence = b"".join(s) +# +# yield { "id" : ident, +# "definition" : definition, +# "sequence" : sequence, +# "quality" : None, +# "offset" : None, +# "tags" : tags, +# "annotation" : {} +# } +# +# read+=1 def fastaNucIterator(lineiterator, @@ -102,8 +95,9 @@ def fastaNucIterator(lineiterator, firstline=None, int buffersize=100000000 ): - cdef str ident - cdef str definition + + cdef bytes ident + cdef bytes definition cdef dict tags cdef list s cdef bytes sequence @@ -143,7 +137,7 @@ def fastaNucIterator(lineiterator, while skipped < skip : line = next(iterator) try: - while line[0]!='>': + while line[:1]!=b'>': line = next(iterator) except StopIteration: pass @@ -154,8 +148,8 @@ def fastaNucIterator(lineiterator, line = next(iterator) try: - while line[0]!='>': - s.append(str2bytes(line)[0:-1]) + while line[:1]!=b'>': + s.append(line[0:-1]) line = next(iterator) except StopIteration: pass @@ -171,17 +165,6 @@ def fastaNucIterator(lineiterator, yield seq -# yield { "id" : ident, -# "definition" : definition, -# "sequence" : sequence, -# "quality" : None, -# "offset" : None, -# "tags" : tags, -# "annotation" : {} -# } - read+=1 - - \ No newline at end of file diff --git a/python/obitools3/parsers/fastq.pyx b/python/obitools3/parsers/fastq.pyx index af88b9b..c92f231 100644 --- a/python/obitools3/parsers/fastq.pyx +++ b/python/obitools3/parsers/fastq.pyx @@ -28,7 +28,8 @@ def 
fastqIterator(lineiterator, offset, firstline, buffersize) - + + def fastqWithQualityIterator(lineiterator, int skip=0, only=None, @@ -36,14 +37,14 @@ def fastqWithQualityIterator(lineiterator, firstline=None, int buffersize=100000000 ): + cdef LineBuffer lb - cdef str ident - cdef str definition + cdef bytes ident + cdef bytes definition cdef dict tags cdef bytes sequence cdef bytes quality - cdef int skipped, lines_to_skip, ionly, read - cdef int j + cdef int skipped, lines_to_skip, ionly, read, j if only is None: ionly=-1 @@ -84,9 +85,9 @@ def fastqWithQualityIterator(lineiterator, break ident,tags,definition = parseHeader(hline) - sequence = str2bytes(line[0:-1]) + sequence = line[0:-1] next(i) - quality = str2bytes(next(i)[0:-1]) + quality = next(i)[0:-1] seq = Nuc_Seq(ident, sequence, @@ -97,15 +98,6 @@ def fastqWithQualityIterator(lineiterator, yield seq -# yield { "id" : ident, -# "definition" : definition, -# "sequence" : sequence, -# "quality" : quality, -# "offset" : offset, -# "tags" : tags, -# "annotation" : {} -# } - read+=1 hline = next(i) @@ -116,8 +108,8 @@ def fastqWithoutQualityIterator(lineiterator, firstline=None, int buffersize=100000000 ): - cdef str ident - cdef str definition + cdef bytes ident + cdef bytes definition cdef dict tags cdef bytes sequence cdef bytes quality @@ -163,7 +155,7 @@ def fastqWithoutQualityIterator(lineiterator, break ident,tags,definition = parseHeader(hline) - sequence = str2bytes(line[0:-1]) + sequence = line[0:-1] next(i) next(i) @@ -175,15 +167,6 @@ def fastqWithoutQualityIterator(lineiterator, tags=tags) yield seq - -# yield { "id" : ident, -# "definition" : definition, -# "sequence" : sequence, -# "quality" : None, -# "offset" : None, -# "tags" : tags, -# "annotation" : {} -# } read+=1 hline = next(i) diff --git a/python/obitools3/parsers/header.pxd b/python/obitools3/parsers/header.pxd index ffcd3cf..423c4d9 100644 --- a/python/obitools3/parsers/header.pxd +++ b/python/obitools3/parsers/header.pxd @@ -1,4 
+1,4 @@ #cython: language_level=3 -cpdef tuple parseHeader(str header) +cpdef tuple parseHeader(bytes header) diff --git a/python/obitools3/parsers/header.pyx b/python/obitools3/parsers/header.pyx index 67fd0a6..cb0261a 100644 --- a/python/obitools3/parsers/header.pyx +++ b/python/obitools3/parsers/header.pyx @@ -10,25 +10,25 @@ from obitools3.utils cimport __etag__ import re -__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''') +__ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''') -cpdef tuple parseHeader(str header): - cdef list m - cdef dict tags - cdef str definition - cdef str ident - cdef str second +cpdef tuple parseHeader(bytes header): + cdef list m + cdef dict tags + cdef bytes definition + cdef bytes ident + cdef bytes second m=header[1:-1].split(maxsplit=1) ident=m[0] - if ident[-1] == ';': + if ident[-1:] == b';': ident = ident[:-1] if len(m)==1: tags={} - definition='' + definition=b'' else: second=m[1] m = __ret__.findall(second) diff --git a/python/obitools3/parsers/ngsfilter.pyx b/python/obitools3/parsers/ngsfilter.pyx index bdaea16..ce841b1 100644 --- a/python/obitools3/parsers/ngsfilter.pyx +++ b/python/obitools3/parsers/ngsfilter.pyx @@ -7,7 +7,6 @@ Created on march 8th 2018 ''' from .tab import tabIterator -from obitools3.utils cimport bytes2str import types @@ -24,18 +23,10 @@ def ngsfilterIterator(lineiterator, ): cdef list all_lines - cdef str header - cdef str sep_str + cdef bytes header cdef bytes out_sep - cdef str out_sep_str out_sep = b"\t" - out_sep_str = "\t" - - if sep is not None: - sep_str = bytes2str(sep) - else: - sep_str = None if isinstance(lineiterator, (str, bytes)): lineiterator=uopen(lineiterator) @@ -56,20 +47,20 @@ def ngsfilterIterator(lineiterator, all_lines.insert(0, firstline) # Insert header for column names - column_names = ["experiment", "sample", "forward_tag", "reverse_tag", "forward_primer", "reverse_primer"] - header = out_sep_str.join(column_names) + column_names 
= [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"] + header = out_sep.join(column_names) new_lines.append(header) for line in all_lines: - split_line = line.split(sep_str) + split_line = line.split(sep) tags = split_line.pop(2) - tags = tags.split(":") + tags = tags.split(b":") if len(tags) == 1: # Forward and reverse tags are the same tags.append(tags[0]) split_line.insert(2, tags[0]) split_line.insert(3, tags[1]) - new_lines.append(out_sep_str.join(split_line[0:6])) + new_lines.append(out_sep.join(split_line[0:6])) return tabIterator(iter(new_lines), header = True, diff --git a/python/obitools3/parsers/tab.pyx b/python/obitools3/parsers/tab.pyx index fda8385..a2fcb7d 100644 --- a/python/obitools3/parsers/tab.pyx +++ b/python/obitools3/parsers/tab.pyx @@ -7,7 +7,6 @@ Created on feb 20th 2018 ''' import types -from obitools3.utils cimport bytes2str, tobytes from obitools3.utils cimport __etag__ @@ -28,17 +27,9 @@ def tabIterator(lineiterator, cdef int lines_to_skip, ionly, read cdef list data cdef dict view_line - cdef str sep_str # TODO can't we read file lines as bytes? cdef list keys cdef list key_types - - if sep is not None: - sep_str = bytes2str(sep) - else: - sep_str = None - - commentchar_str = bytes2str(commentchar) - + keys = [] key_types = [] skipped = 0 @@ -68,7 +59,7 @@ def tabIterator(lineiterator, while True: - if (not line.strip() and blanklineskip) or line[0] == commentchar_str: + if (not line.strip() and blanklineskip) or line[:1] == commentchar: line = next(iterator) if ionly >= 0 and read >= ionly: @@ -77,13 +68,13 @@ def tabIterator(lineiterator, if not keys: if header: # TODO read types eventually - keys = line.split(sep_str) - keys = [tobytes(x.strip()) for x in keys] + keys = line.split(sep) + keys = [x.strip() for x in keys] line = next(iterator) continue else: # TODO ??? default column names? like R? 
- keys = [str(i) for i in range(len(line.split(sep_str)))] + keys = [str(i).encode() for i in range(len(line.split(sep)))] while skipped < skip : line = next(iterator) @@ -92,7 +83,7 @@ def tabIterator(lineiterator, view_line = {} # Parse - data = line.split(sep_str) + data = line.split(sep) if stripwhite or key_types: data = [x.strip() for x in data] diff --git a/python/obitools3/parsers/universal.pyx b/python/obitools3/parsers/universal.pyx index 10f7c37..8e90dd7 100644 --- a/python/obitools3/parsers/universal.pyx +++ b/python/obitools3/parsers/universal.pyx @@ -5,10 +5,11 @@ from obitools3.parsers.fasta import fastaNucIterator from obitools3.parsers.fastq import fastqIterator from obitools3.parsers.tab import tabIterator from obitools3.parsers.ngsfilter import ngsfilterIterator +from obitools3.parsers.embl import emblIterator -oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I) -tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I) +oligore = re.compile(b"^[ACGTRYSWKMBDHVN]+$",re.I) +tagre = re.compile(b"^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I) def is_ngsfilter_line(line): # TODO doesn't work? try: @@ -16,8 +17,8 @@ def is_ngsfilter_line(line): # TODO doesn't work? ok = tagre.match(parts[2]) ok&= oligore.match(parts[3]) ok&= oligore.match(parts[4]) - ok&= parts[5]=="F" | parts[5]=="T" - return ok + ok&= parts[5] in (b"F", b"T") + return ok except: return False @@ -55,19 +56,22 @@ def entryIteratorFactory(lineiterator, format=b"tabular" - if first[0]==">": - format=b"fasta" - if first[0]=="@": - format=b"fastq" - elif first[0:3]=='ID ': - format=b"embl" - elif first[0:6]=='LOCUS ': - format=b"genbank" - elif first[0:11]=='#@ecopcr-v2': # TODO v2???? 
- format=b"ecopcrfile" - elif is_ngsfilter_line(first): - format=b"ngsfilter" - + try: + if first[:1]==b">": + format=b"fasta" + elif first[:1]==b"@": + format=b"fastq" + elif first[0:3]==b'ID ': + format=b"embl" + elif first[0:6]==b'LOCUS ': + format=b"genbank" + elif first[0:11]==b'#@ecopcr-v2': # TODO v2???? + format=b"ecopcrfile" + elif is_ngsfilter_line(first): + format=b"ngsfilter" + except IndexError: + pass + # TODO Temporary fix first=None lineiterator.seek(0) @@ -114,6 +118,14 @@ def entryIteratorFactory(lineiterator, firstline=first, buffersize=buffersize), dict) + + elif format==b'embl': + return (emblIterator(lineiterator, + skip=skip, + only=only, + firstline=first, + buffersize=buffersize), + dict) raise NotImplementedError('File format not yet implemented') diff --git a/python/obitools3/utils.pxd b/python/obitools3/utils.pxd index 4d9849d..0673af4 100644 --- a/python/obitools3/utils.pxd +++ b/python/obitools3/utils.pxd @@ -14,4 +14,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) cdef obitype_t get_obitype_iterable_value(object value) cdef obitype_t get_obitype(object value) -cdef object __etag__(str x) +cdef object __etag__(bytes x) diff --git a/python/obitools3/utils.pyx b/python/obitools3/utils.pyx index 5d0bb83..d75225b 100644 --- a/python/obitools3/utils.pyx +++ b/python/obitools3/utils.pyx @@ -160,10 +160,10 @@ cdef obitype_t get_obitype(object value) : return get_obitype_single_value(value) -__re_int__ = re.compile("^[+-]?[0-9]+$") -__re_float__ = re.compile("^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$") -__re_str__ = re.compile("""^"[^"]*"|'[^']*'$""") -__re_dict__ = re.compile("""^\{\ * +__re_int__ = re.compile(b"^[+-]?[0-9]+$") +__re_float__ = re.compile(b"^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$") +__re_str__ = re.compile(b"""^(?:"[^"]*"|'[^']*')$""") +__re_dict__ = re.compile(b"""^\{\ * ( ("[^"]*"|'[^']*') \ *:\ * @@ -181,9 +181,9 @@ __re_dict__ = re.compile("""^\{\ * ) )*\ *\}$""", re.VERBOSE) -__re_val__ = 
re.compile("""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""") +__re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""") -cdef object __etag__(str x): +cdef object __etag__(bytes x): cdef list elements cdef tuple i @@ -193,11 +193,11 @@ cdef object __etag__(str x): v=float(x) elif __re_str__.match(x): v=x[1:-1] - elif x=='None': + elif x==b'None': v=None - elif x=='False': + elif x==b'False': v=False - elif x=='True': + elif x==b'True': v=True elif __re_dict__.match(x): elements=__re_val__.findall(x)