Cython API: added EMBL parser; files to import are now read in binary mode
This commit is contained in:
Celine Mercier
2018-07-28 16:57:01 +02:00
parent 7f6d1597fc
commit 3e8c187f0b
12 changed files with 161 additions and 200 deletions

View File

@ -8,13 +8,13 @@ Created on 30 mars 2016
cdef class LineBuffer:
def __init__(self,object fileobj,int size=100000000):
def __init__(self, object fileobj, int size=100000000):
self.fileobj=fileobj
self.size=size
def __iter__(self):
cdef list buff = self.fileobj.readlines(self.size)
cdef str l
cdef object l # Can be str or bytes
while buff:
for l in buff:

View File

@ -2,4 +2,4 @@
from .uncompress cimport CompressedFile
cpdef CompressedFile uopen(str name, mode=?)
cpdef CompressedFile uopen(object name, mode=?)

View File

@ -7,15 +7,16 @@ Created on 25 mars 2016
'''
from urllib.request import urlopen
from obitools3.utils cimport tostr
cpdef CompressedFile uopen(str name, mode='r'):
cpdef CompressedFile uopen(object name, mode='rb'):
cdef CompressedFile c
try:
f = urlopen(name)
f = urlopen(tostr(name))
except:
f = open(name,mode)
f = open(tostr(name),mode)
c = CompressedFile(f)

View File

@ -11,89 +11,82 @@ import types
from obitools3.dms.obiseq cimport Nuc_Seq
def fastaIterator(lineiterator,
int skip=0,
only=None,
firstline=None,
int buffersize=100000000
):
cdef str ident
cdef str definition
cdef dict tags
cdef list s
cdef bytes sequence
cdef int skipped, ionly, read
# cdef OBI_Seq seq
if only is None:
ionly=-1
else:
ionly=int(only)
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, LineBuffer):
iterator = iter(lineiterator)
else:
if hasattr(lineiterator, "readlines"):
iterator = iter(LineBuffer(lineiterator, buffersize))
elif hasattr(lineiterator, '__next__'):
iterator = lineiterator
else:
raise Exception("Invalid line iterator")
skipped = 0
i = iterator
if firstline is None:
line = next(i)
else:
line = firstline
while True:
if ionly >= 0 and read >= ionly:
break
while skipped < skip :
line = next(i)
try:
while line[0]!='>':
line = next(i)
except StopIteration:
pass
skipped += 1
ident,tags,definition = parseHeader(line)
s = []
line = next(i)
try:
while line[0]!='>':
s.append(str2bytes(line)[0:-1])
line = next(i)
except StopIteration:
pass
sequence = b"".join(s)
# seq = OBI_Seq(id,
# sequence,
# definition,
# tags=tags,
# )
# TODO Seq object
yield { "id" : ident,
"definition" : definition,
"sequence" : sequence,
"quality" : None,
"offset" : None,
"tags" : tags,
"annotation" : {}
}
read+=1
# def fastaIterator(lineiterator,
# int skip=0,
# only=None,
# firstline=None,
# int buffersize=100000000
# ):
# cdef str ident
# cdef str definition
# cdef dict tags
# cdef list s
# cdef bytes sequence
# cdef int skipped, ionly, read
#
# if only is None:
# ionly=-1
# else:
# ionly=int(only)
#
# if isinstance(lineiterator, (str, bytes)):
# lineiterator=uopen(lineiterator)
# if isinstance(lineiterator, LineBuffer):
# iterator = iter(lineiterator)
# else:
# if hasattr(lineiterator, "readlines"):
# iterator = iter(LineBuffer(lineiterator, buffersize))
# elif hasattr(lineiterator, '__next__'):
# iterator = lineiterator
# else:
# raise Exception("Invalid line iterator")
#
# skipped = 0
# i = iterator
#
# if firstline is None:
# line = next(i)
# else:
# line = firstline
#
# while True:
#
# if ionly >= 0 and read >= ionly:
# break
#
# while skipped < skip :
# line = next(i)
# try:
# while line[0]!='>':
# line = next(i)
# except StopIteration:
# pass
# skipped += 1
#
# ident,tags,definition = parseHeader(line)
# s = []
# line = next(i)
#
# try:
# while line[0]!='>':
# s.append(str2bytes(line)[0:-1])
# line = next(i)
#
# except StopIteration:
# pass
#
# sequence = b"".join(s)
#
# yield { "id" : ident,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : None,
# "offset" : None,
# "tags" : tags,
# "annotation" : {}
# }
#
# read+=1
def fastaNucIterator(lineiterator,
@ -102,8 +95,9 @@ def fastaNucIterator(lineiterator,
firstline=None,
int buffersize=100000000
):
cdef str ident
cdef str definition
cdef bytes ident
cdef bytes definition
cdef dict tags
cdef list s
cdef bytes sequence
@ -143,7 +137,7 @@ def fastaNucIterator(lineiterator,
while skipped < skip :
line = next(iterator)
try:
while line[0]!='>':
while line[:1]!=b'>':
line = next(iterator)
except StopIteration:
pass
@ -154,8 +148,8 @@ def fastaNucIterator(lineiterator,
line = next(iterator)
try:
while line[0]!='>':
s.append(str2bytes(line)[0:-1])
while line[:1]!=b'>':
s.append(line[0:-1])
line = next(iterator)
except StopIteration:
pass
@ -171,17 +165,6 @@ def fastaNucIterator(lineiterator,
yield seq
# yield { "id" : ident,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : None,
# "offset" : None,
# "tags" : tags,
# "annotation" : {}
# }
read+=1

View File

@ -29,6 +29,7 @@ def fastqIterator(lineiterator,
firstline,
buffersize)
def fastqWithQualityIterator(lineiterator,
int skip=0,
only=None,
@ -36,14 +37,14 @@ def fastqWithQualityIterator(lineiterator,
firstline=None,
int buffersize=100000000
):
cdef LineBuffer lb
cdef str ident
cdef str definition
cdef bytes ident
cdef bytes definition
cdef dict tags
cdef bytes sequence
cdef bytes quality
cdef int skipped, lines_to_skip, ionly, read
cdef int j
cdef int skipped, lines_to_skip, ionly, read, j
if only is None:
ionly=-1
@ -84,9 +85,9 @@ def fastqWithQualityIterator(lineiterator,
break
ident,tags,definition = parseHeader(hline)
sequence = str2bytes(line[0:-1])
sequence = line[0:-1]
next(i)
quality = str2bytes(next(i)[0:-1])
quality = next(i)[0:-1]
seq = Nuc_Seq(ident,
sequence,
@ -97,15 +98,6 @@ def fastqWithQualityIterator(lineiterator,
yield seq
# yield { "id" : ident,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : quality,
# "offset" : offset,
# "tags" : tags,
# "annotation" : {}
# }
read+=1
hline = next(i)
@ -116,8 +108,8 @@ def fastqWithoutQualityIterator(lineiterator,
firstline=None,
int buffersize=100000000
):
cdef str ident
cdef str definition
cdef bytes ident
cdef bytes definition
cdef dict tags
cdef bytes sequence
cdef bytes quality
@ -163,7 +155,7 @@ def fastqWithoutQualityIterator(lineiterator,
break
ident,tags,definition = parseHeader(hline)
sequence = str2bytes(line[0:-1])
sequence = line[0:-1]
next(i)
next(i)
@ -176,15 +168,6 @@ def fastqWithoutQualityIterator(lineiterator,
yield seq
# yield { "id" : ident,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : None,
# "offset" : None,
# "tags" : tags,
# "annotation" : {}
# }
read+=1
hline = next(i)

View File

@ -1,4 +1,4 @@
#cython: language_level=3
cpdef tuple parseHeader(str header)
cpdef tuple parseHeader(bytes header)

View File

@ -10,25 +10,25 @@ from obitools3.utils cimport __etag__
import re
__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
__ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
cpdef tuple parseHeader(str header):
cpdef tuple parseHeader(bytes header):
cdef list m
cdef dict tags
cdef str definition
cdef str ident
cdef str second
cdef bytes definition
cdef bytes ident
cdef bytes second
m=header[1:-1].split(maxsplit=1)
ident=m[0]
if ident[-1] == ';':
if len(ident)>1 and ident[-2:-1] == b';':
ident = ident[:-1]
if len(m)==1:
tags={}
definition=''
definition=b''
else:
second=m[1]
m = __ret__.findall(second)

View File

@ -7,7 +7,6 @@ Created on march 8th 2018
'''
from .tab import tabIterator
from obitools3.utils cimport bytes2str
import types
@ -24,18 +23,10 @@ def ngsfilterIterator(lineiterator,
):
cdef list all_lines
cdef str header
cdef str sep_str
cdef bytes header
cdef bytes out_sep
cdef str out_sep_str
out_sep = b"\t"
out_sep_str = "\t"
if sep is not None:
sep_str = bytes2str(sep)
else:
sep_str = None
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
@ -56,20 +47,20 @@ def ngsfilterIterator(lineiterator,
all_lines.insert(0, firstline)
# Insert header for column names
column_names = ["experiment", "sample", "forward_tag", "reverse_tag", "forward_primer", "reverse_primer"]
header = out_sep_str.join(column_names)
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"]
header = out_sep.join(column_names)
new_lines.append(header)
for line in all_lines:
split_line = line.split(sep_str)
split_line = line.split(sep)
tags = split_line.pop(2)
tags = tags.split(":")
tags = tags.split(b":")
if len(tags) == 1: # Forward and reverse tags are the same
tags.append(tags[0])
split_line.insert(2, tags[0])
split_line.insert(3, tags[1])
new_lines.append(out_sep_str.join(split_line[0:6]))
new_lines.append(out_sep.join(split_line[0:6]))
return tabIterator(iter(new_lines),
header = True,

View File

@ -7,7 +7,6 @@ Created on feb 20th 2018
'''
import types
from obitools3.utils cimport bytes2str, tobytes
from obitools3.utils cimport __etag__
@ -28,17 +27,9 @@ def tabIterator(lineiterator,
cdef int lines_to_skip, ionly, read
cdef list data
cdef dict view_line
cdef str sep_str # TODO can't we read file lines as bytes?
cdef list keys
cdef list key_types
if sep is not None:
sep_str = bytes2str(sep)
else:
sep_str = None
commentchar_str = bytes2str(commentchar)
keys = []
key_types = []
skipped = 0
@ -68,7 +59,7 @@ def tabIterator(lineiterator,
while True:
if (not line.strip() and blanklineskip) or line[0] == commentchar_str:
if (not line.strip() and blanklineskip) or line[:1] == commentchar:
line = next(iterator)
if ionly >= 0 and read >= ionly:
@ -77,13 +68,13 @@ def tabIterator(lineiterator,
if not keys:
if header:
# TODO read types eventually
keys = line.split(sep_str)
keys = [tobytes(x.strip()) for x in keys]
keys = line.split(sep)
keys = [x.strip() for x in keys]
line = next(iterator)
continue
else:
# TODO ??? default column names? like R?
keys = [str(i) for i in range(len(line.split(sep_str)))]
keys = [i for i in range(len(line.split(sep)))]
while skipped < skip :
line = next(iterator)
@ -92,7 +83,7 @@ def tabIterator(lineiterator,
view_line = {}
# Parse
data = line.split(sep_str)
data = line.split(sep)
if stripwhite or key_types:
data = [x.strip() for x in data]

View File

@ -5,10 +5,11 @@ from obitools3.parsers.fasta import fastaNucIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.parsers.tab import tabIterator
from obitools3.parsers.ngsfilter import ngsfilterIterator
from obitools3.parsers.embl import emblIterator
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)
oligore = re.compile(b"^[ACGTRYSWKMBDHVN]+$",re.I)
tagre = re.compile(b"^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)
def is_ngsfilter_line(line): # TODO doesn't work?
try:
@ -16,7 +17,7 @@ def is_ngsfilter_line(line): # TODO doesn't work?
ok = tagre.match(parts[2])
ok&= oligore.match(parts[3])
ok&= oligore.match(parts[4])
ok&= parts[5]=="F" | parts[5]=="T"
ok&= parts[5]==b"F" | parts[5]==b"T"
return ok
except:
return False
@ -55,18 +56,21 @@ def entryIteratorFactory(lineiterator,
format=b"tabular"
if first[0]==">":
try:
if first[:1]==b">":
format=b"fasta"
if first[0]=="@":
if first[:1]==b"@":
format=b"fastq"
elif first[0:3]=='ID ':
elif first[0:3]==b'ID ':
format=b"embl"
elif first[0:6]=='LOCUS ':
elif first[0:6]==b'LOCUS ':
format=b"genbank"
elif first[0:11]=='#@ecopcr-v2': # TODO v2????
elif first[0:11]==b'#@ecopcr-v2': # TODO v2????
format=b"ecopcrfile"
elif is_ngsfilter_line(first):
format=b"ngsfilter"
except IndexError:
pass
# TODO Temporary fix
first=None
@ -115,5 +119,13 @@ def entryIteratorFactory(lineiterator,
buffersize=buffersize),
dict)
elif format==b'embl':
return (emblIterator(lineiterator,
skip=skip,
only=only,
firstline=first,
buffersize=buffersize),
dict)
raise NotImplementedError('File format not yet implemented')

View File

@ -14,4 +14,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value)
cdef obitype_t get_obitype_iterable_value(object value)
cdef obitype_t get_obitype(object value)
cdef object __etag__(str x)
cdef object __etag__(bytes x)

View File

@ -160,10 +160,10 @@ cdef obitype_t get_obitype(object value) :
return get_obitype_single_value(value)
__re_int__ = re.compile("^[+-]?[0-9]+$")
__re_float__ = re.compile("^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$")
__re_str__ = re.compile("""^"[^"]*"|'[^']*'$""")
__re_dict__ = re.compile("""^\{\ *
__re_int__ = re.compile(b"^[+-]?[0-9]+$")
__re_float__ = re.compile(b"^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$")
__re_str__ = re.compile(b"""^"[^"]*"|'[^']*'$""")
__re_dict__ = re.compile(b"""^\{\ *
(
("[^"]*"|'[^']*')
\ *:\ *
@ -181,9 +181,9 @@ __re_dict__ = re.compile("""^\{\ *
)
)*\ *\}$""", re.VERBOSE)
__re_val__ = re.compile("""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
__re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
cdef object __etag__(str x):
cdef object __etag__(bytes x):
cdef list elements
cdef tuple i
@ -193,11 +193,11 @@ cdef object __etag__(str x):
v=float(x)
elif __re_str__.match(x):
v=x[1:-1]
elif x=='None':
elif x==b'None':
v=None
elif x=='False':
elif x==b'False':
v=False
elif x=='True':
elif x==b'True':
v=True
elif __re_dict__.match(x):
elements=__re_val__.findall(x)