Cython API: obi import can now import ngsfilter files and tabular files

This commit is contained in:
Celine Mercier
2018-03-12 18:10:43 +01:00
parent 8a0b95c1d6
commit 15e43bb9a1
9 changed files with 168 additions and 142 deletions

View File

@ -17,7 +17,6 @@ def fastaIterator(lineiterator,
firstline=None,
int buffersize=100000000
):
cdef LineBuffer lb
cdef str ident
cdef str definition
cdef dict tags
@ -31,23 +30,26 @@ def fastaIterator(lineiterator,
else:
ionly=int(only)
if isinstance(lineiterator,(str,bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, LineBuffer):
lb=lineiterator
iterator = iter(lineiterator)
else:
lb=LineBuffer(lineiterator,buffersize)
if hasattr(lineiterator, "readlines"):
iterator = iter(LineBuffer(lineiterator, buffersize))
elif hasattr(lineiterator, '__next__'):
iterator = lineiterator
else:
raise Exception("Invalid line iterator")
skipped = 0
i = iter(lb)
i = iterator
if firstline is None:
line = next(i)
else:
line = firstline
while True:
if ionly >= 0 and read >= ionly:
@ -81,7 +83,7 @@ def fastaIterator(lineiterator,
# definition,
# tags=tags,
# )
# TODO
# TODO Seq object
yield { "id" : ident,
"definition" : definition,
"sequence" : sequence,
@ -100,7 +102,6 @@ def fastaNucIterator(lineiterator,
firstline=None,
int buffersize=100000000
):
cdef LineBuffer lb
cdef str ident
cdef str definition
cdef dict tags
@ -115,14 +116,16 @@ def fastaNucIterator(lineiterator,
ionly = int(only)
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, types.GeneratorType):
iterator = lineiterator
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, LineBuffer):
iterator = iter(lineiterator)
else:
iterator = iter(LineBuffer(lineiterator, buffersize))
if hasattr(lineiterator, "readlines"):
iterator = iter(LineBuffer(lineiterator, buffersize))
elif hasattr(lineiterator, '__next__'):
iterator = lineiterator
else:
raise Exception("Invalid line iterator")
skipped = 0
read = 0

View File

@ -12,7 +12,7 @@ from obitools3.dms.obiseq cimport Nuc_Seq
def fastqIterator(lineiterator,
int skip=0,
only=None,
int qualityoffset=-1,
int offset=-1,
bint noquality=False,
firstline=None,
int buffersize=100000000
@ -25,14 +25,14 @@ def fastqIterator(lineiterator,
else:
return fastqWithQualityIterator(lineiterator,
skip,only,
qualityoffset,
offset,
firstline,
buffersize)
def fastqWithQualityIterator(lineiterator,
int skip=0,
only=None,
int qualityoffset=-1,
int offset=-1,
firstline=None,
int buffersize=100000000
):
@ -49,21 +49,25 @@ def fastqWithQualityIterator(lineiterator,
ionly=-1
else:
ionly=int(only)
if isinstance(lineiterator,(str,bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, LineBuffer):
lb=lineiterator
iterator = iter(lineiterator)
else:
lb=LineBuffer(lineiterator,buffersize)
i = iter(lb)
if hasattr(lineiterator, "readlines"):
iterator = iter(LineBuffer(lineiterator, buffersize))
elif hasattr(lineiterator, '__next__'):
iterator = lineiterator
else:
raise Exception("Invalid line iterator")
i = iterator
lines_to_skip = skip*4 - (firstline is not None)
for skipped in range(lines_to_skip):
next(i)
if skip > 0:
firstline=None
@ -88,7 +92,7 @@ def fastqWithQualityIterator(lineiterator,
sequence,
definition=definition,
quality=quality,
offset=qualityoffset,
offset=offset,
tags=tags)
yield seq
@ -97,7 +101,7 @@ def fastqWithQualityIterator(lineiterator,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : quality,
# "offset" : qualityoffset,
# "offset" : offset,
# "tags" : tags,
# "annotation" : {}
# }
@ -112,7 +116,6 @@ def fastqWithoutQualityIterator(lineiterator,
firstline=None,
int buffersize=100000000
):
cdef LineBuffer lb
cdef str ident
cdef str definition
cdef dict tags
@ -126,15 +129,19 @@ def fastqWithoutQualityIterator(lineiterator,
else:
ionly=int(only)
if isinstance(lineiterator,(str,bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, LineBuffer):
lb=lineiterator
iterator = iter(lineiterator)
else:
lb=LineBuffer(lineiterator,buffersize)
if hasattr(lineiterator, "readlines"):
iterator = iter(LineBuffer(lineiterator, buffersize))
elif hasattr(lineiterator, '__next__'):
iterator = lineiterator
else:
raise Exception("Invalid line iterator")
i = iter(lb)
i = iterator
lines_to_skip = skip*4 - (firstline is not None)
for skipped in range(lines_to_skip):

View File

@ -1,5 +1,4 @@
#cython: language_level=3
cdef object __etag__(str x)
cpdef tuple parseHeader(str header)

View File

@ -6,54 +6,12 @@ Created on 25 mars 2016
@author: coissac
'''
from obitools3.utils cimport __etag__
import re
# Matches one `key=value;` item of a header: the key is a run of non-space
# characters, the value is single-quoted, double-quoted, or any run of
# characters other than ';'. Spaces after the ';' are consumed.
__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
# Optionally signed run of decimal digits.
__re_int__ = re.compile("^[+-]?[0-9]+$")
# Optionally signed digits with an optional fractional part and an optional
# exponent. Plain integers also match this pattern.
__re_float__ = re.compile("^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$")
# Single- or double-quoted string.
# NOTE(review): '|' has lower precedence than the anchors, so '^' applies only
# to the double-quoted alternative and '$' only to the single-quoted one; a
# value such as "a"b would be accepted through the first alternative.
# Confirm whether ^(...|...)$ was intended.
__re_str__ = re.compile("""^"[^"]*"|'[^']*'$""")
# Brace-delimited, comma-separated list of `key: value` pairs whose keys are
# quoted; the first pair is optional, so `{}` matches. Compiled with
# re.VERBOSE, hence the escaped spaces.
__re_dict__ = re.compile("""^\{\ *
(
("[^"]*"|'[^']*')
\ *:\ *
([^,}]+|
"[^"]*"|
'[^']*'
)
)?
(\ *,\ *
("[^"]*"|'[^']*')
\ *:\ *
([^,}]+|
"[^"]*"|
'[^']*'
)
)*\ *\}$""", re.VERBOSE)
# One `key: value` pair followed by ',' or '}'. Group 2 is the quoted key and
# group 3 the raw value text.
__re_val__ = re.compile("""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")
# Same pattern as the first __ret__ assignment above; rebinding it does not
# change behaviour.
__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')
# Convert the text of a tag value into a Python object.
#
# The checks are tried in order and the first one that applies wins:
#   int -> float -> quoted string -> None/False/True -> dict -> raw text.
# Returns the converted value, or `x` itself when nothing matches.
cdef object __etag__(str x):
cdef list elements
cdef tuple i
# Integers must be tested before floats: __re_float__ also matches plain
# digit strings.
if __re_int__.match(x):
v=int(x)
elif __re_float__.match(x):
v=float(x)
elif __re_str__.match(x):
# Drop the first and last characters (the surrounding quotes).
v=x[1:-1]
# The literal words None / False / True map to the Python constants.
elif x=='None':
v=None
elif x=='False':
v=False
elif x=='True':
v=True
elif __re_dict__.match(x):
# Each findall() tuple holds (whole pair, quoted key, raw value): the key
# is unquoted and the value is converted recursively.
elements=__re_val__.findall(x)
v=dict([(i[1][1:-1],__etag__(i[2])) for i in elements])
else:
# Unrecognised syntax: keep the text unchanged.
v=x
return v
cpdef tuple parseHeader(str header):
cdef list m

View File

@ -3,12 +3,14 @@
import re
from obitools3.parsers.fasta import fastaNucIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.parsers.tab import tabIterator
from obitools3.parsers.ngsfilter import ngsfilterIterator
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)
def is_ngsfilter_line(line):
def is_ngsfilter_line(line): # TODO doesn't work?
try:
parts = line.split()
ok = tagre.match(parts[2])
@ -23,7 +25,7 @@ def entryIteratorFactory(lineiterator,
int skip=0,
only=None,
bytes seqtype=b'nuc',
int qualityoffset=-1,
int offset=-1,
bint noquality=False,
bint skiperror=True,
bint header=False,
@ -35,15 +37,19 @@ def entryIteratorFactory(lineiterator,
bytes commentchar=b"#",
int buffersize=100000000):
if isinstance(lineiterator,(str,bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator)
if isinstance(lineiterator, LineBuffer):
lb=lineiterator
iterator = iter(lineiterator)
else:
lb=LineBuffer(lineiterator, buffersize)
if hasattr(lineiterator, "readlines"):
iterator = iter(LineBuffer(lineiterator, buffersize))
elif hasattr(lineiterator, '__next__'):
iterator = lineiterator
else:
raise Exception("Invalid line iterator")
i = iter(lb)
i = iterator
first=next(i)
@ -57,11 +63,11 @@ def entryIteratorFactory(lineiterator,
format=b"embl"
elif first[0:6]=='LOCUS ':
format=b"genbank"
elif first[0:11]=='#@ecopcr-v2':
elif first[0:11]=='#@ecopcr-v2': # TODO v2????
format=b"ecopcrfile"
elif is_ngsfilter_line(first):
format=b"ngsfilter"
# TODO Temporary fix
first=None
lineiterator.seek(0)
@ -78,12 +84,36 @@ def entryIteratorFactory(lineiterator,
elif format==b'fastq':
return (fastqIterator(lineiterator,
skip=skip,only=only,
qualityoffset=qualityoffset,
offset=offset,
noquality=noquality,
firstline=first,
buffersize=buffersize),
Nuc_Seq)
elif format==b'tabular':
return (tabIterator(lineiterator,
header = header,
sep = sep,
dec = dec,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
skip = skip,
only = only,
firstline=first,
buffersize=buffersize),
dict)
elif format==b'ngsfilter':
return (ngsfilterIterator(lineiterator,
sep = sep,
dec = dec,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
skip = skip,
only = only,
firstline=first,
buffersize=buffersize),
dict)
raise NotImplementedError('File format not yet implemented')