Cython API: added EMBL parser, and files to import are now read in binary mode
This commit is contained in:
Celine Mercier
2018-07-28 16:57:01 +02:00
parent 7f6d1597fc
commit 3e8c187f0b
12 changed files with 161 additions and 200 deletions

View File

@ -8,13 +8,13 @@ Created on 30 mars 2016
cdef class LineBuffer: cdef class LineBuffer:
def __init__(self,object fileobj,int size=100000000): def __init__(self, object fileobj, int size=100000000):
self.fileobj=fileobj self.fileobj=fileobj
self.size=size self.size=size
def __iter__(self): def __iter__(self):
cdef list buff = self.fileobj.readlines(self.size) cdef list buff = self.fileobj.readlines(self.size)
cdef str l cdef object l # Can be str or bytes
while buff: while buff:
for l in buff: for l in buff:

View File

@ -2,4 +2,4 @@
from .uncompress cimport CompressedFile from .uncompress cimport CompressedFile
cpdef CompressedFile uopen(str name, mode=?) cpdef CompressedFile uopen(object name, mode=?)

View File

@ -7,15 +7,16 @@ Created on 25 mars 2016
''' '''
from urllib.request import urlopen from urllib.request import urlopen
from obitools3.utils cimport tostr
cpdef CompressedFile uopen(str name, mode='r'): cpdef CompressedFile uopen(object name, mode='rb'):
cdef CompressedFile c cdef CompressedFile c
try: try:
f = urlopen(name) f = urlopen(tostr(name))
except: except:
f = open(name,mode) f = open(tostr(name),mode)
c = CompressedFile(f) c = CompressedFile(f)

View File

@ -11,89 +11,82 @@ import types
from obitools3.dms.obiseq cimport Nuc_Seq from obitools3.dms.obiseq cimport Nuc_Seq
def fastaIterator(lineiterator, # def fastaIterator(lineiterator,
int skip=0, # int skip=0,
only=None, # only=None,
firstline=None, # firstline=None,
int buffersize=100000000 # int buffersize=100000000
): # ):
cdef str ident # cdef str ident
cdef str definition # cdef str definition
cdef dict tags # cdef dict tags
cdef list s # cdef list s
cdef bytes sequence # cdef bytes sequence
cdef int skipped, ionly, read # cdef int skipped, ionly, read
# cdef OBI_Seq seq #
# if only is None:
if only is None: # ionly=-1
ionly=-1 # else:
else: # ionly=int(only)
ionly=int(only) #
# if isinstance(lineiterator, (str, bytes)):
if isinstance(lineiterator, (str, bytes)): # lineiterator=uopen(lineiterator)
lineiterator=uopen(lineiterator) # if isinstance(lineiterator, LineBuffer):
if isinstance(lineiterator, LineBuffer): # iterator = iter(lineiterator)
iterator = iter(lineiterator) # else:
else: # if hasattr(lineiterator, "readlines"):
if hasattr(lineiterator, "readlines"): # iterator = iter(LineBuffer(lineiterator, buffersize))
iterator = iter(LineBuffer(lineiterator, buffersize)) # elif hasattr(lineiterator, '__next__'):
elif hasattr(lineiterator, '__next__'): # iterator = lineiterator
iterator = lineiterator # else:
else: # raise Exception("Invalid line iterator")
raise Exception("Invalid line iterator") #
# skipped = 0
skipped = 0 # i = iterator
i = iterator #
# if firstline is None:
if firstline is None: # line = next(i)
line = next(i) # else:
else: # line = firstline
line = firstline #
# while True:
while True: #
# if ionly >= 0 and read >= ionly:
if ionly >= 0 and read >= ionly: # break
break #
# while skipped < skip :
while skipped < skip : # line = next(i)
line = next(i) # try:
try: # while line[0]!='>':
while line[0]!='>': # line = next(i)
line = next(i) # except StopIteration:
except StopIteration: # pass
pass # skipped += 1
skipped += 1 #
# ident,tags,definition = parseHeader(line)
ident,tags,definition = parseHeader(line) # s = []
s = [] # line = next(i)
line = next(i) #
# try:
try: # while line[0]!='>':
while line[0]!='>': # s.append(str2bytes(line)[0:-1])
s.append(str2bytes(line)[0:-1]) # line = next(i)
line = next(i) #
# except StopIteration:
except StopIteration: # pass
pass #
# sequence = b"".join(s)
sequence = b"".join(s) #
# yield { "id" : ident,
# seq = OBI_Seq(id, # "definition" : definition,
# sequence, # "sequence" : sequence,
# definition, # "quality" : None,
# tags=tags, # "offset" : None,
# ) # "tags" : tags,
# TODO Seq object # "annotation" : {}
yield { "id" : ident, # }
"definition" : definition, #
"sequence" : sequence, # read+=1
"quality" : None,
"offset" : None,
"tags" : tags,
"annotation" : {}
}
read+=1
def fastaNucIterator(lineiterator, def fastaNucIterator(lineiterator,
@ -102,8 +95,9 @@ def fastaNucIterator(lineiterator,
firstline=None, firstline=None,
int buffersize=100000000 int buffersize=100000000
): ):
cdef str ident
cdef str definition cdef bytes ident
cdef bytes definition
cdef dict tags cdef dict tags
cdef list s cdef list s
cdef bytes sequence cdef bytes sequence
@ -143,7 +137,7 @@ def fastaNucIterator(lineiterator,
while skipped < skip : while skipped < skip :
line = next(iterator) line = next(iterator)
try: try:
while line[0]!='>': while line[:1]!=b'>':
line = next(iterator) line = next(iterator)
except StopIteration: except StopIteration:
pass pass
@ -154,8 +148,8 @@ def fastaNucIterator(lineiterator,
line = next(iterator) line = next(iterator)
try: try:
while line[0]!='>': while line[:1]!=b'>':
s.append(str2bytes(line)[0:-1]) s.append(line[0:-1])
line = next(iterator) line = next(iterator)
except StopIteration: except StopIteration:
pass pass
@ -171,17 +165,6 @@ def fastaNucIterator(lineiterator,
yield seq yield seq
# yield { "id" : ident,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : None,
# "offset" : None,
# "tags" : tags,
# "annotation" : {}
# }
read+=1 read+=1

View File

@ -28,7 +28,8 @@ def fastqIterator(lineiterator,
offset, offset,
firstline, firstline,
buffersize) buffersize)
def fastqWithQualityIterator(lineiterator, def fastqWithQualityIterator(lineiterator,
int skip=0, int skip=0,
only=None, only=None,
@ -36,14 +37,14 @@ def fastqWithQualityIterator(lineiterator,
firstline=None, firstline=None,
int buffersize=100000000 int buffersize=100000000
): ):
cdef LineBuffer lb cdef LineBuffer lb
cdef str ident cdef bytes ident
cdef str definition cdef bytes definition
cdef dict tags cdef dict tags
cdef bytes sequence cdef bytes sequence
cdef bytes quality cdef bytes quality
cdef int skipped, lines_to_skip, ionly, read cdef int skipped, lines_to_skip, ionly, read, j
cdef int j
if only is None: if only is None:
ionly=-1 ionly=-1
@ -84,9 +85,9 @@ def fastqWithQualityIterator(lineiterator,
break break
ident,tags,definition = parseHeader(hline) ident,tags,definition = parseHeader(hline)
sequence = str2bytes(line[0:-1]) sequence = line[0:-1]
next(i) next(i)
quality = str2bytes(next(i)[0:-1]) quality = next(i)[0:-1]
seq = Nuc_Seq(ident, seq = Nuc_Seq(ident,
sequence, sequence,
@ -97,15 +98,6 @@ def fastqWithQualityIterator(lineiterator,
yield seq yield seq
# yield { "id" : ident,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : quality,
# "offset" : offset,
# "tags" : tags,
# "annotation" : {}
# }
read+=1 read+=1
hline = next(i) hline = next(i)
@ -116,8 +108,8 @@ def fastqWithoutQualityIterator(lineiterator,
firstline=None, firstline=None,
int buffersize=100000000 int buffersize=100000000
): ):
cdef str ident cdef bytes ident
cdef str definition cdef bytes definition
cdef dict tags cdef dict tags
cdef bytes sequence cdef bytes sequence
cdef bytes quality cdef bytes quality
@ -163,7 +155,7 @@ def fastqWithoutQualityIterator(lineiterator,
break break
ident,tags,definition = parseHeader(hline) ident,tags,definition = parseHeader(hline)
sequence = str2bytes(line[0:-1]) sequence = line[0:-1]
next(i) next(i)
next(i) next(i)
@ -175,15 +167,6 @@ def fastqWithoutQualityIterator(lineiterator,
tags=tags) tags=tags)
yield seq yield seq
# yield { "id" : ident,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : None,
# "offset" : None,
# "tags" : tags,
# "annotation" : {}
# }
read+=1 read+=1
hline = next(i) hline = next(i)

View File

@ -1,4 +1,4 @@
#cython: language_level=3 #cython: language_level=3
cpdef tuple parseHeader(str header) cpdef tuple parseHeader(bytes header)

View File

@ -10,25 +10,25 @@ from obitools3.utils cimport __etag__
import re import re
__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''') __ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
cpdef tuple parseHeader(str header): cpdef tuple parseHeader(bytes header):
cdef list m cdef list m
cdef dict tags cdef dict tags
cdef str definition cdef bytes definition
cdef str ident cdef bytes ident
cdef str second cdef bytes second
m=header[1:-1].split(maxsplit=1) m=header[1:-1].split(maxsplit=1)
ident=m[0] ident=m[0]
if ident[-1] == ';': if len(ident)>1 and ident[-2:-1] == b';':
ident = ident[:-1] ident = ident[:-1]
if len(m)==1: if len(m)==1:
tags={} tags={}
definition='' definition=b''
else: else:
second=m[1] second=m[1]
m = __ret__.findall(second) m = __ret__.findall(second)

View File

@ -7,7 +7,6 @@ Created on march 8th 2018
''' '''
from .tab import tabIterator from .tab import tabIterator
from obitools3.utils cimport bytes2str
import types import types
@ -24,18 +23,10 @@ def ngsfilterIterator(lineiterator,
): ):
cdef list all_lines cdef list all_lines
cdef str header cdef bytes header
cdef str sep_str
cdef bytes out_sep cdef bytes out_sep
cdef str out_sep_str
out_sep = b"\t" out_sep = b"\t"
out_sep_str = "\t"
if sep is not None:
sep_str = bytes2str(sep)
else:
sep_str = None
if isinstance(lineiterator, (str, bytes)): if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator) lineiterator=uopen(lineiterator)
@ -56,20 +47,20 @@ def ngsfilterIterator(lineiterator,
all_lines.insert(0, firstline) all_lines.insert(0, firstline)
# Insert header for column names # Insert header for column names
column_names = ["experiment", "sample", "forward_tag", "reverse_tag", "forward_primer", "reverse_primer"] column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"]
header = out_sep_str.join(column_names) header = out_sep.join(column_names)
new_lines.append(header) new_lines.append(header)
for line in all_lines: for line in all_lines:
split_line = line.split(sep_str) split_line = line.split(sep)
tags = split_line.pop(2) tags = split_line.pop(2)
tags = tags.split(":") tags = tags.split(b":")
if len(tags) == 1: # Forward and reverse tags are the same if len(tags) == 1: # Forward and reverse tags are the same
tags.append(tags[0]) tags.append(tags[0])
split_line.insert(2, tags[0]) split_line.insert(2, tags[0])
split_line.insert(3, tags[1]) split_line.insert(3, tags[1])
new_lines.append(out_sep_str.join(split_line[0:6])) new_lines.append(out_sep.join(split_line[0:6]))
return tabIterator(iter(new_lines), return tabIterator(iter(new_lines),
header = True, header = True,

View File

@ -7,7 +7,6 @@ Created on feb 20th 2018
''' '''
import types import types
from obitools3.utils cimport bytes2str, tobytes
from obitools3.utils cimport __etag__ from obitools3.utils cimport __etag__
@ -28,17 +27,9 @@ def tabIterator(lineiterator,
cdef int lines_to_skip, ionly, read cdef int lines_to_skip, ionly, read
cdef list data cdef list data
cdef dict view_line cdef dict view_line
cdef str sep_str # TODO can't we read file lines as bytes?
cdef list keys cdef list keys
cdef list key_types cdef list key_types
if sep is not None:
sep_str = bytes2str(sep)
else:
sep_str = None
commentchar_str = bytes2str(commentchar)
keys = [] keys = []
key_types = [] key_types = []
skipped = 0 skipped = 0
@ -68,7 +59,7 @@ def tabIterator(lineiterator,
while True: while True:
if (not line.strip() and blanklineskip) or line[0] == commentchar_str: if (not line.strip() and blanklineskip) or line[:1] == commentchar:
line = next(iterator) line = next(iterator)
if ionly >= 0 and read >= ionly: if ionly >= 0 and read >= ionly:
@ -77,13 +68,13 @@ def tabIterator(lineiterator,
if not keys: if not keys:
if header: if header:
# TODO read types eventually # TODO read types eventually
keys = line.split(sep_str) keys = line.split(sep)
keys = [tobytes(x.strip()) for x in keys] keys = [x.strip() for x in keys]
line = next(iterator) line = next(iterator)
continue continue
else: else:
# TODO ??? default column names? like R? # TODO ??? default column names? like R?
keys = [str(i) for i in range(len(line.split(sep_str)))] keys = [i for i in range(len(line.split(sep)))]
while skipped < skip : while skipped < skip :
line = next(iterator) line = next(iterator)
@ -92,7 +83,7 @@ def tabIterator(lineiterator,
view_line = {} view_line = {}
# Parse # Parse
data = line.split(sep_str) data = line.split(sep)
if stripwhite or key_types: if stripwhite or key_types:
data = [x.strip() for x in data] data = [x.strip() for x in data]

View File

@ -5,10 +5,11 @@ from obitools3.parsers.fasta import fastaNucIterator
from obitools3.parsers.fastq import fastqIterator from obitools3.parsers.fastq import fastqIterator
from obitools3.parsers.tab import tabIterator from obitools3.parsers.tab import tabIterator
from obitools3.parsers.ngsfilter import ngsfilterIterator from obitools3.parsers.ngsfilter import ngsfilterIterator
from obitools3.parsers.embl import emblIterator
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I) oligore = re.compile(b"^[ACGTRYSWKMBDHVN]+$",re.I)
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I) tagre = re.compile(b"^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)
def is_ngsfilter_line(line): # TODO doesn't work? def is_ngsfilter_line(line): # TODO doesn't work?
try: try:
@ -16,8 +17,8 @@ def is_ngsfilter_line(line): # TODO doesn't work?
ok = tagre.match(parts[2]) ok = tagre.match(parts[2])
ok&= oligore.match(parts[3]) ok&= oligore.match(parts[3])
ok&= oligore.match(parts[4]) ok&= oligore.match(parts[4])
ok&= parts[5]=="F" | parts[5]=="T" ok&= parts[5]==b"F" | parts[5]==b"T"
return ok return ok
except: except:
return False return False
@ -55,19 +56,22 @@ def entryIteratorFactory(lineiterator,
format=b"tabular" format=b"tabular"
if first[0]==">": try:
format=b"fasta" if first[:1]==b">":
if first[0]=="@": format=b"fasta"
format=b"fastq" if first[:1]==b"@":
elif first[0:3]=='ID ': format=b"fastq"
format=b"embl" elif first[0:3]==b'ID ':
elif first[0:6]=='LOCUS ': format=b"embl"
format=b"genbank" elif first[0:6]==b'LOCUS ':
elif first[0:11]=='#@ecopcr-v2': # TODO v2???? format=b"genbank"
format=b"ecopcrfile" elif first[0:11]==b'#@ecopcr-v2': # TODO v2????
elif is_ngsfilter_line(first): format=b"ecopcrfile"
format=b"ngsfilter" elif is_ngsfilter_line(first):
format=b"ngsfilter"
except IndexError:
pass
# TODO Temporary fix # TODO Temporary fix
first=None first=None
lineiterator.seek(0) lineiterator.seek(0)
@ -114,6 +118,14 @@ def entryIteratorFactory(lineiterator,
firstline=first, firstline=first,
buffersize=buffersize), buffersize=buffersize),
dict) dict)
elif format==b'embl':
return (emblIterator(lineiterator,
skip=skip,
only=only,
firstline=first,
buffersize=buffersize),
dict)
raise NotImplementedError('File format not yet implemented') raise NotImplementedError('File format not yet implemented')

View File

@ -14,4 +14,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value)
cdef obitype_t get_obitype_iterable_value(object value) cdef obitype_t get_obitype_iterable_value(object value)
cdef obitype_t get_obitype(object value) cdef obitype_t get_obitype(object value)
cdef object __etag__(str x) cdef object __etag__(bytes x)

View File

@ -160,10 +160,10 @@ cdef obitype_t get_obitype(object value) :
return get_obitype_single_value(value) return get_obitype_single_value(value)
__re_int__ = re.compile("^[+-]?[0-9]+$") __re_int__ = re.compile(b"^[+-]?[0-9]+$")
__re_float__ = re.compile("^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$") __re_float__ = re.compile(b"^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$")
__re_str__ = re.compile("""^"[^"]*"|'[^']*'$""") __re_str__ = re.compile(b"""^"[^"]*"|'[^']*'$""")
__re_dict__ = re.compile("""^\{\ * __re_dict__ = re.compile(b"""^\{\ *
( (
("[^"]*"|'[^']*') ("[^"]*"|'[^']*')
\ *:\ * \ *:\ *
@ -181,9 +181,9 @@ __re_dict__ = re.compile("""^\{\ *
) )
)*\ *\}$""", re.VERBOSE) )*\ *\}$""", re.VERBOSE)
__re_val__ = re.compile("""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""") __re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
cdef object __etag__(str x): cdef object __etag__(bytes x):
cdef list elements cdef list elements
cdef tuple i cdef tuple i
@ -193,11 +193,11 @@ cdef object __etag__(str x):
v=float(x) v=float(x)
elif __re_str__.match(x): elif __re_str__.match(x):
v=x[1:-1] v=x[1:-1]
elif x=='None': elif x==b'None':
v=None v=None
elif x=='False': elif x==b'False':
v=False v=False
elif x=='True': elif x==b'True':
v=True v=True
elif __re_dict__.match(x): elif __re_dict__.match(x):
elements=__re_val__.findall(x) elements=__re_val__.findall(x)