Cython API: added EMBL parser and files to import are now read in binary mode
@@ -8,13 +8,13 @@ Created on 30 mars 2016

 cdef class LineBuffer:

-    def __init__(self,object fileobj,int size=100000000):
+    def __init__(self, object fileobj, int size=100000000):
         self.fileobj=fileobj
         self.size=size

     def __iter__(self):
         cdef list buff = self.fileobj.readlines(self.size)
-        cdef str l
+        cdef object l # Can be str or bytes

         while buff:
             for l in buff:
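Because input files are now opened in binary mode, LineBuffer iterates over bytes lines rather than str lines, which is why the loop variable is now declared as object. A minimal stand-alone sketch of the buffered iteration it performs (the function name is hypothetical; only fileobj.readlines(size) is assumed, as in the diff):

    def iter_lines_buffered(fileobj, size=100000000):
        # readlines(hint) returns whole lines until roughly `hint` bytes are read;
        # with a file opened in 'rb' mode every line is a bytes object.
        buff = fileobj.readlines(size)
        while buff:
            for line in buff:
                yield line
            buff = fileobj.readlines(size)

    # Usage sketch:
    # with open("sequences.fasta", "rb") as f:
    #     for line in iter_lines_buffered(f):
    #         ...  # line is bytes, e.g. b'>seq1 count=1;\n'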
@@ -2,4 +2,4 @@

 from .uncompress cimport CompressedFile

-cpdef CompressedFile uopen(str name, mode=?)
+cpdef CompressedFile uopen(object name, mode=?)

@@ -7,15 +7,16 @@ Created on 25 mars 2016
 '''

 from urllib.request import urlopen
+from obitools3.utils cimport tostr


-cpdef CompressedFile uopen(str name, mode='r'):
+cpdef CompressedFile uopen(object name, mode='rb'):
     cdef CompressedFile c

     try:
-        f = urlopen(name)
+        f = urlopen(tostr(name))
     except:
-        f = open(name,mode)
+        f = open(tostr(name),mode)

     c = CompressedFile(f)
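uopen now defaults to binary mode and accepts either a str or a bytes file name, converting it with tostr before handing it to urlopen or open. A hedged sketch of that fallback logic, with a hypothetical to_str helper standing in for obitools3.utils.tostr and the CompressedFile wrapping left out:

    from urllib.request import urlopen

    def to_str(name):
        # Stand-in for tostr(): accept bytes or str file names.
        return name.decode("utf-8") if isinstance(name, bytes) else name

    def open_url_or_file(name, mode="rb"):
        # Try the name as a URL first; fall back to a local file in binary mode.
        try:
            return urlopen(to_str(name))
        except Exception:
            return open(to_str(name), mode)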
@@ -11,89 +11,82 @@ import types
 from obitools3.dms.obiseq cimport Nuc_Seq


-def fastaIterator(lineiterator,
-                  int skip=0,
-                  only=None,
-                  firstline=None,
-                  int buffersize=100000000
-                  ):
-    cdef str ident
-    cdef str definition
-    cdef dict tags
-    cdef list s
-    cdef bytes sequence
-    cdef int skipped, ionly, read
-    # cdef OBI_Seq seq
-
-    if only is None:
-        ionly=-1
-    else:
-        ionly=int(only)
-
-    if isinstance(lineiterator, (str, bytes)):
-        lineiterator=uopen(lineiterator)
-    if isinstance(lineiterator, LineBuffer):
-        iterator = iter(lineiterator)
-    else:
-        if hasattr(lineiterator, "readlines"):
-            iterator = iter(LineBuffer(lineiterator, buffersize))
-        elif hasattr(lineiterator, '__next__'):
-            iterator = lineiterator
-        else:
-            raise Exception("Invalid line iterator")
-
-    skipped = 0
-    i = iterator
-
-    if firstline is None:
-        line = next(i)
-    else:
-        line = firstline
-
-    while True:
-
-        if ionly >= 0 and read >= ionly:
-            break
-
-        while skipped < skip :
-            line = next(i)
-            try:
-                while line[0]!='>':
-                    line = next(i)
-            except StopIteration:
-                pass
-            skipped += 1
-
-        ident,tags,definition = parseHeader(line)
-        s = []
-        line = next(i)
-
-        try:
-            while line[0]!='>':
-                s.append(str2bytes(line)[0:-1])
-                line = next(i)
-
-        except StopIteration:
-            pass
-
-        sequence = b"".join(s)
-
-        # seq = OBI_Seq(id,
-        #               sequence,
-        #               definition,
-        #               tags=tags,
-        #               )
-        # TODO Seq object
-        yield { "id" : ident,
-                "definition" : definition,
-                "sequence" : sequence,
-                "quality" : None,
-                "offset" : None,
-                "tags" : tags,
-                "annotation" : {}
-              }
-
-        read+=1
+# def fastaIterator(lineiterator,
+#                   int skip=0,
+#                   only=None,
+#                   firstline=None,
+#                   int buffersize=100000000
+#                   ):
+#     cdef str ident
+#     cdef str definition
+#     cdef dict tags
+#     cdef list s
+#     cdef bytes sequence
+#     cdef int skipped, ionly, read
+#
+#     if only is None:
+#         ionly=-1
+#     else:
+#         ionly=int(only)
+#
+#     if isinstance(lineiterator, (str, bytes)):
+#         lineiterator=uopen(lineiterator)
+#     if isinstance(lineiterator, LineBuffer):
+#         iterator = iter(lineiterator)
+#     else:
+#         if hasattr(lineiterator, "readlines"):
+#             iterator = iter(LineBuffer(lineiterator, buffersize))
+#         elif hasattr(lineiterator, '__next__'):
+#             iterator = lineiterator
+#         else:
+#             raise Exception("Invalid line iterator")
+#
+#     skipped = 0
+#     i = iterator
+#
+#     if firstline is None:
+#         line = next(i)
+#     else:
+#         line = firstline
+#
+#     while True:
+#
+#         if ionly >= 0 and read >= ionly:
+#             break
+#
+#         while skipped < skip :
+#             line = next(i)
+#             try:
+#                 while line[0]!='>':
+#                     line = next(i)
+#             except StopIteration:
+#                 pass
+#             skipped += 1
+#
+#         ident,tags,definition = parseHeader(line)
+#         s = []
+#         line = next(i)
+#
+#         try:
+#             while line[0]!='>':
+#                 s.append(str2bytes(line)[0:-1])
+#                 line = next(i)
+#
+#         except StopIteration:
+#             pass
+#
+#         sequence = b"".join(s)
+#
+#         yield { "id" : ident,
+#                 "definition" : definition,
+#                 "sequence" : sequence,
+#                 "quality" : None,
+#                 "offset" : None,
+#                 "tags" : tags,
+#                 "annotation" : {}
+#               }
+#
+#         read+=1


 def fastaNucIterator(lineiterator,
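The old dict-yielding fastaIterator is kept only as a commented-out reference; fastaNucIterator below is the active reader and now works on bytes lines. A simplified, stand-alone sketch of the record splitting it performs (hypothetical helper; header parsing is reduced to stripping the '>'):

    def iter_fasta_records(lines):
        # `lines` is an iterator of bytes lines from a file opened in 'rb' mode.
        header = None
        chunks = []
        for line in lines:
            if line[:1] == b">":              # one-byte slice; line[0] would be an int
                if header is not None:
                    yield header, b"".join(chunks)
                header = line[1:].rstrip()
                chunks = []
            elif header is not None:
                chunks.append(line.rstrip())
        if header is not None:
            yield header, b"".join(chunks)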
@@ -102,8 +95,9 @@ def fastaNucIterator(lineiterator,
                      firstline=None,
                      int buffersize=100000000
                      ):
-    cdef str ident
-    cdef str definition
+    cdef bytes ident
+    cdef bytes definition
     cdef dict tags
     cdef list s
     cdef bytes sequence

@@ -143,7 +137,7 @@ def fastaNucIterator(lineiterator,
         while skipped < skip :
             line = next(iterator)
             try:
-                while line[0]!='>':
+                while line[:1]!=b'>':
                     line = next(iterator)
             except StopIteration:
                 pass

@@ -154,8 +148,8 @@ def fastaNucIterator(lineiterator,
         line = next(iterator)

         try:
-            while line[0]!='>':
-                s.append(str2bytes(line)[0:-1])
+            while line[:1]!=b'>':
+                s.append(line[0:-1])
                 line = next(iterator)
         except StopIteration:
             pass
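The switch from line[0]!='>' to line[:1]!=b'>' is the key bytes idiom in this commit: indexing a bytes object returns an int, so a comparison against a one-character string (or even against b'>') can never match, whereas a one-byte slice stays a bytes object and is also safe on an empty line. A quick illustration:

    line = b">seq1\n"
    print(line[0])             # 62, the ASCII code of '>', as an int
    print(line[0] == ">")      # False: int compared to str
    print(line[0] == b">")     # False: int compared to bytes
    print(line[:1] == b">")    # True: bytes slice compared to bytes
    print(b""[:1] == b">")     # False, with no IndexError on an empty line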
@@ -171,17 +165,6 @@ def fastaNucIterator(lineiterator,

         yield seq

-        # yield { "id" : ident,
-        #         "definition" : definition,
-        #         "sequence" : sequence,
-        #         "quality" : None,
-        #         "offset" : None,
-        #         "tags" : tags,
-        #         "annotation" : {}
-        #       }
-
         read+=1



@@ -29,6 +29,7 @@ def fastqIterator(lineiterator,
                                        firstline,
                                        buffersize)
+

 def fastqWithQualityIterator(lineiterator,
                              int skip=0,
                              only=None,
@@ -36,14 +37,14 @@ def fastqWithQualityIterator(lineiterator,
                              firstline=None,
                              int buffersize=100000000
                              ):
     cdef LineBuffer lb
-    cdef str ident
-    cdef str definition
+    cdef bytes ident
+    cdef bytes definition
     cdef dict tags
     cdef bytes sequence
     cdef bytes quality
-    cdef int skipped, lines_to_skip, ionly, read
-    cdef int j
+    cdef int skipped, lines_to_skip, ionly, read, j

     if only is None:
         ionly=-1

@@ -84,9 +85,9 @@ def fastqWithQualityIterator(lineiterator,
             break

         ident,tags,definition = parseHeader(hline)
-        sequence = str2bytes(line[0:-1])
+        sequence = line[0:-1]
         next(i)
-        quality = str2bytes(next(i)[0:-1])
+        quality = next(i)[0:-1]

         seq = Nuc_Seq(ident,
                       sequence,

@@ -97,15 +98,6 @@ def fastqWithQualityIterator(lineiterator,

         yield seq

-        # yield { "id" : ident,
-        #         "definition" : definition,
-        #         "sequence" : sequence,
-        #         "quality" : quality,
-        #         "offset" : offset,
-        #         "tags" : tags,
-        #         "annotation" : {}
-        #       }
-
         read+=1
         hline = next(i)

@@ -116,8 +108,8 @@ def fastqWithoutQualityIterator(lineiterator,
                                 firstline=None,
                                 int buffersize=100000000
                                 ):
-    cdef str ident
-    cdef str definition
+    cdef bytes ident
+    cdef bytes definition
     cdef dict tags
     cdef bytes sequence
     cdef bytes quality

@@ -163,7 +155,7 @@ def fastqWithoutQualityIterator(lineiterator,
             break

         ident,tags,definition = parseHeader(hline)
-        sequence = str2bytes(line[0:-1])
+        sequence = line[0:-1]
         next(i)
         next(i)

@@ -176,15 +168,6 @@ def fastqWithoutQualityIterator(lineiterator,

         yield seq

-        # yield { "id" : ident,
-        #         "definition" : definition,
-        #         "sequence" : sequence,
-        #         "quality" : None,
-        #         "offset" : None,
-        #         "tags" : tags,
-        #         "annotation" : {}
-        #       }
-
         read+=1
         hline = next(i)
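Since the lines are already bytes, the FASTQ iterators keep sequence and quality exactly as read instead of routing them through str2bytes. A minimal sketch of the four-line record layout they consume (hypothetical helper, with no error checking beyond a truncated final record):

    def iter_fastq_records(lines):
        # `lines` is an iterator of bytes lines; one record spans four lines.
        for header in lines:
            try:
                sequence = next(lines)[0:-1]   # drop the trailing newline
                next(lines)                    # the '+' separator line
                quality = next(lines)[0:-1]
            except StopIteration:              # truncated final record
                return
            yield header[1:-1], sequence, quality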
@ -1,4 +1,4 @@
|
||||
#cython: language_level=3
|
||||
|
||||
|
||||
cpdef tuple parseHeader(str header)
|
||||
cpdef tuple parseHeader(bytes header)
|
||||
|
@ -10,25 +10,25 @@ from obitools3.utils cimport __etag__
|
||||
import re
|
||||
|
||||
|
||||
__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
|
||||
__ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
|
||||
|
||||
|
||||
cpdef tuple parseHeader(str header):
|
||||
cdef list m
|
||||
cdef dict tags
|
||||
cdef str definition
|
||||
cdef str ident
|
||||
cdef str second
|
||||
cpdef tuple parseHeader(bytes header):
|
||||
cdef list m
|
||||
cdef dict tags
|
||||
cdef bytes definition
|
||||
cdef bytes ident
|
||||
cdef bytes second
|
||||
|
||||
m=header[1:-1].split(maxsplit=1)
|
||||
|
||||
ident=m[0]
|
||||
if ident[-1] == ';':
|
||||
if len(ident)>1 and ident[-2:-1] == b';':
|
||||
ident = ident[:-1]
|
||||
|
||||
if len(m)==1:
|
||||
tags={}
|
||||
definition=''
|
||||
definition=b''
|
||||
else:
|
||||
second=m[1]
|
||||
m = __ret__.findall(second)
|
||||
|
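parseHeader follows the same pattern: the tag regex is compiled from a bytes literal so it can be applied to bytes headers, the split works on bytes, and the trailing ';' test uses a slice instead of an index. A small, hypothetical illustration of that parsing on a bytes FASTA header (the helper and the sample header are made up; the pattern is the one from the diff):

    import re

    # bytes pattern, so findall() accepts bytes input
    etag_re = re.compile(b"""(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?""")

    header = b">seq1; count=3; direction='forward'; a wolf sample\n"
    ident, rest = header[1:-1].split(maxsplit=1)
    if ident[-1:] == b";":          # slice comparison, not ident[-1] (an int)
        ident = ident[:-1]
    tags = {key: value for _, key, value in etag_re.findall(rest)}
    print(ident)   # b'seq1'
    print(tags)    # {b'count': b'3', b'direction': b"'forward'"}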
@@ -7,7 +7,6 @@ Created on march 8th 2018
 '''

 from .tab import tabIterator
-from obitools3.utils cimport bytes2str
 import types

@@ -24,18 +23,10 @@ def ngsfilterIterator(lineiterator,
                       ):

     cdef list all_lines
-    cdef str header
-    cdef str sep_str
+    cdef bytes header
     cdef bytes out_sep
-    cdef str out_sep_str

     out_sep = b"\t"
-    out_sep_str = "\t"
-
-    if sep is not None:
-        sep_str = bytes2str(sep)
-    else:
-        sep_str = None

     if isinstance(lineiterator, (str, bytes)):
         lineiterator=uopen(lineiterator)

@@ -56,20 +47,20 @@ def ngsfilterIterator(lineiterator,
         all_lines.insert(0, firstline)

     # Insert header for column names
-    column_names = ["experiment", "sample", "forward_tag", "reverse_tag", "forward_primer", "reverse_primer"]
-    header = out_sep_str.join(column_names)
+    column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"]
+    header = out_sep.join(column_names)

     new_lines.append(header)

     for line in all_lines:
-        split_line = line.split(sep_str)
+        split_line = line.split(sep)
         tags = split_line.pop(2)
-        tags = tags.split(":")
+        tags = tags.split(b":")
         if len(tags) == 1: # Forward and reverse tags are the same
             tags.append(tags[0])
         split_line.insert(2, tags[0])
         split_line.insert(3, tags[1])
-        new_lines.append(out_sep_str.join(split_line[0:6]))
+        new_lines.append(out_sep.join(split_line[0:6]))

     return tabIterator(iter(new_lines),
                        header = True,
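The ngsfilter rewriting now stays in bytes from end to end: the column names, the ':' tag separator and the '\t' output separator are all bytes, so the rebuilt lines can be handed straight to the tab parser. A hedged sketch of the per-line rewrite (simplified and hypothetical; the real code also handles firstline and the extra ngsfilter columns):

    def rewrite_ngsfilter_line(line, sep=None):
        # line: one bytes line "experiment sample tag(s) forward_primer reverse_primer ..."
        parts = line.split(sep)            # bytes.split(None) splits on whitespace
        tags = parts.pop(2).split(b":")
        if len(tags) == 1:                 # same tag on both ends
            tags.append(tags[0])
        parts[2:2] = tags                  # insert forward_tag and reverse_tag
        return b"\t".join(parts[0:6])

    print(rewrite_ngsfilter_line(b"wolf 13a ACCT ggattag ccgtgtg F @ pos=1"))
    # b'wolf\t13a\tACCT\tACCT\tggattag\tccgtgtg'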
@@ -7,7 +7,6 @@ Created on feb 20th 2018
 '''

 import types
-from obitools3.utils cimport bytes2str, tobytes
 from obitools3.utils cimport __etag__

@@ -28,17 +27,9 @@ def tabIterator(lineiterator,
     cdef int lines_to_skip, ionly, read
     cdef list data
     cdef dict view_line
-    cdef str sep_str # TODO can't we read file lines as bytes?
     cdef list keys
     cdef list key_types

-    if sep is not None:
-        sep_str = bytes2str(sep)
-    else:
-        sep_str = None
-
-    commentchar_str = bytes2str(commentchar)
-
     keys = []
     key_types = []
     skipped = 0

@@ -68,7 +59,7 @@ def tabIterator(lineiterator,

     while True:

-        if (not line.strip() and blanklineskip) or line[0] == commentchar_str:
+        if (not line.strip() and blanklineskip) or line[:1] == commentchar:
             line = next(iterator)

         if ionly >= 0 and read >= ionly:

@@ -77,13 +68,13 @@ def tabIterator(lineiterator,
         if not keys:
             if header:
                 # TODO read types eventually
-                keys = line.split(sep_str)
-                keys = [tobytes(x.strip()) for x in keys]
+                keys = line.split(sep)
+                keys = [x.strip() for x in keys]
                 line = next(iterator)
                 continue
             else:
                 # TODO ??? default column names? like R?
-                keys = [str(i) for i in range(len(line.split(sep_str)))]
+                keys = [i for i in range(len(line.split(sep)))]

         while skipped < skip :
             line = next(iterator)

@@ -92,7 +83,7 @@ def tabIterator(lineiterator,
         view_line = {}

         # Parse
-        data = line.split(sep_str)
+        data = line.split(sep)

         if stripwhite or key_types:
             data = [x.strip() for x in data]
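With bytes input, the tab parser no longer needs the sep_str/bytes2str round-trips: the separator, the comment character and the column keys all stay bytes. A short, hypothetical illustration of the header and row handling:

    lines = [b"id\tcount\tsample\n", b"seq1\t3\twolf_a\n"]
    sep = b"\t"

    keys = [k.strip() for k in lines[0].split(sep)]    # [b'id', b'count', b'sample']
    values = [v.strip() for v in lines[1].split(sep)]
    print(dict(zip(keys, values)))
    # {b'id': b'seq1', b'count': b'3', b'sample': b'wolf_a'}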
@@ -5,10 +5,11 @@ from obitools3.parsers.fasta import fastaNucIterator
 from obitools3.parsers.fastq import fastqIterator
 from obitools3.parsers.tab import tabIterator
 from obitools3.parsers.ngsfilter import ngsfilterIterator
+from obitools3.parsers.embl import emblIterator


-oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)
-tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)
+oligore = re.compile(b"^[ACGTRYSWKMBDHVN]+$",re.I)
+tagre = re.compile(b"^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)

 def is_ngsfilter_line(line): # TODO doesn't work?
     try:

@@ -16,7 +17,7 @@ def is_ngsfilter_line(line): # TODO doesn't work?
         ok = tagre.match(parts[2])
         ok&= oligore.match(parts[3])
         ok&= oligore.match(parts[4])
-        ok&= parts[5]=="F" | parts[5]=="T"
+        ok&= parts[5]==b"F" | parts[5]==b"T"
         return ok
     except:
         return False

@@ -55,18 +56,21 @@ def entryIteratorFactory(lineiterator,

     format=b"tabular"

-    if first[0]==">":
-        format=b"fasta"
-    if first[0]=="@":
-        format=b"fastq"
-    elif first[0:3]=='ID ':
-        format=b"embl"
-    elif first[0:6]=='LOCUS ':
-        format=b"genbank"
-    elif first[0:11]=='#@ecopcr-v2': # TODO v2????
-        format=b"ecopcrfile"
-    elif is_ngsfilter_line(first):
-        format=b"ngsfilter"
+    try:
+        if first[:1]==b">":
+            format=b"fasta"
+        if first[:1]==b"@":
+            format=b"fastq"
+        elif first[0:3]==b'ID ':
+            format=b"embl"
+        elif first[0:6]==b'LOCUS ':
+            format=b"genbank"
+        elif first[0:11]==b'#@ecopcr-v2': # TODO v2????
+            format=b"ecopcrfile"
+        elif is_ngsfilter_line(first):
+            format=b"ngsfilter"
+    except IndexError:
+        pass

     # TODO Temporary fix
     first=None
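Format detection now compares bytes prefixes and is wrapped in a try/except IndexError as a guard against very short or empty first lines. A condensed sketch of the same decision (hypothetical helper; the prefixes are taken from the diff, and the ngsfilter branch is omitted):

    def sniff_format(first):
        # `first` is the first bytes line of the input
        fmt = b"tabular"
        try:
            if first[:1] == b">":
                fmt = b"fasta"
            elif first[:1] == b"@":
                fmt = b"fastq"
            elif first[0:3] == b"ID ":
                fmt = b"embl"
            elif first[0:6] == b"LOCUS ":
                fmt = b"genbank"
            elif first[0:11] == b"#@ecopcr-v2":
                fmt = b"ecopcrfile"
        except IndexError:      # mirrors the guard added in the diff
            pass
        return fmt

    print(sniff_format(b"@read1"))                       # b'fastq'
    print(sniff_format(b"ID   X56734; SV 1; linear"))    # b'embl'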
@@ -115,5 +119,13 @@ def entryIteratorFactory(lineiterator,
                                   buffersize=buffersize),
                 dict)

+    elif format==b'embl':
+        return (emblIterator(lineiterator,
+                             skip=skip,
+                             only=only,
+                             firstline=first,
+                             buffersize=buffersize),
+                dict)
+
     raise NotImplementedError('File format not yet implemented')
@@ -14,4 +14,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value)
 cdef obitype_t get_obitype_iterable_value(object value)
 cdef obitype_t get_obitype(object value)

-cdef object __etag__(str x)
+cdef object __etag__(bytes x)

@@ -160,10 +160,10 @@ cdef obitype_t get_obitype(object value) :
         return get_obitype_single_value(value)


-__re_int__ = re.compile("^[+-]?[0-9]+$")
-__re_float__ = re.compile("^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$")
-__re_str__ = re.compile("""^"[^"]*"|'[^']*'$""")
-__re_dict__ = re.compile("""^\{\ *
+__re_int__ = re.compile(b"^[+-]?[0-9]+$")
+__re_float__ = re.compile(b"^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$")
+__re_str__ = re.compile(b"""^"[^"]*"|'[^']*'$""")
+__re_dict__ = re.compile(b"""^\{\ *
     (
      ("[^"]*"|'[^']*')
      \ *:\ *

@@ -181,9 +181,9 @@ __re_dict__ = re.compile("""^\{\ *
      )
 )*\ *\}$""", re.VERBOSE)

-__re_val__ = re.compile("""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
+__re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")

-cdef object __etag__(str x):
+cdef object __etag__(bytes x):
     cdef list elements
     cdef tuple i

@@ -193,11 +193,11 @@ cdef object __etag__(str x):
         v=float(x)
     elif __re_str__.match(x):
         v=x[1:-1]
-    elif x=='None':
+    elif x==b'None':
         v=None
-    elif x=='False':
+    elif x==b'False':
         v=False
-    elif x=='True':
+    elif x==b'True':
         v=True
     elif __re_dict__.match(x):
         elements=__re_val__.findall(x)
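After this change __etag__ decodes tag values directly from bytes: the int/float/str/dict patterns are bytes regexes and the literal comparisons use b'None', b'False' and b'True'. A rough, hypothetical sketch of that decoding order (dict handling omitted):

    import re

    re_int = re.compile(b"^[+-]?[0-9]+$")
    re_float = re.compile(rb"^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$")
    re_str = re.compile(b"""^"[^"]*"|'[^']*'$""")

    def decode_tag_value(x):
        # x is a bytes token taken from a sequence header
        if re_int.match(x):
            return int(x)
        if re_float.match(x):
            return float(x)
        if re_str.match(x):
            return x[1:-1]           # drop the surrounding quotes
        if x == b"None":
            return None
        if x == b"False":
            return False
        if x == b"True":
            return True
        return x                     # otherwise keep the raw bytes

    print(decode_tag_value(b"42"), decode_tag_value(b"3.14"), decode_tag_value(b"True"))
    # 42 3.14 True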