Files
obitools3/python/obitools3/parsers/fasta.pyx
Celine Mercier b6b95f26b6 obi import: Skipping sequences is now done through the iterators so that
sequences are not uselessly parsed
2017-07-10 17:02:30 +02:00

143 lines
3.3 KiB
Cython

#cython: language_level=3
'''
Iterators over the sequence records of FASTA formatted data.

Created on 30 March 2016
@author: coissac
'''
#from obitools3.dms._obiseq cimport OBI_Seq
def fastaIterator(lineiterator,
                  int buffersize=100000000,
                  int skip=0
                  ):
    '''
    Iterate over the records of FASTA formatted data.

    A record starts at a line whose first character is '>' and extends up to
    (but not including) the next such line, or to the end of the input.

    @param lineiterator: the source of lines. A `str` or `bytes` value is
                         passed to `uopen`; a `LineBuffer` is used as is;
                         anything else is wrapped into a `LineBuffer`.
                         Lines are presumably `str` instances (they are
                         compared with '>' and converted with `str2bytes`).
    @param buffersize: forwarded to `LineBuffer` when `lineiterator` has to
                       be wrapped; ignored otherwise.
    @param skip: number of leading records to jump over. Skipped records are
                 not parsed, their lines are only read and discarded.

    @return: a generator yielding one `dict` per record with the keys
             "id", "definition" and "tags" (as returned by `parseHeader`
             for the header line), "sequence" (the sequence lines joined
             into a single `bytes` value, line terminators removed),
             "quality" (always None) and "annotation" (a new empty dict).
             Nothing is yielded for empty input or when `skip` is greater
             than or equal to the number of records.
    '''
    cdef LineBuffer lb
    cdef str ident
    cdef str definition
    cdef dict tags
    cdef list s
    cdef bytes sequence
    cdef bytes quality
    cdef int skipped

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)

    # End of input is tracked with a None sentinel instead of letting
    # StopIteration escape from the generator body: where PEP 479 applies,
    # such an escaping StopIteration is turned into a RuntimeError.
    line = next(i, None)

    # Skipping only has to happen once, before the first record is parsed.
    # Each pass leaves the current header and stops on the next one.
    skipped = 0
    while line is not None and skipped < skip:
        line = next(i, None)
        while line is not None and line[0] != '>':
            line = next(i, None)
        skipped += 1

    # Here `line` is either the header of the next record or None, so the
    # header parser is never called on a stale line once the input is
    # exhausted.
    while line is not None:
        ident, tags, definition = parseHeader(line)
        s = []

        line = next(i, None)
        while line is not None and line[0] != '>':
            # Only the line terminator is removed. Dropping the last
            # character unconditionally would lose the final residue of an
            # input that does not end with a newline.
            s.append(str2bytes(line).rstrip(b"\r\n"))
            line = next(i, None)

        sequence = b"".join(s)
        quality = None      # FASTA data carries no quality information

        yield {"id": ident,
               "definition": definition,
               "sequence": sequence,
               "quality": quality,
               "tags": tags,
               "annotation": {}
               }
def fastaNucIterator(lineiterator,
                     int buffersize=100000000,
                     int skip=0
                     ):
    '''
    Iterate over the nucleic sequence records of FASTA formatted data.

    A record starts at a line whose first character is '>' and extends up to
    (but not including) the next such line, or to the end of the input.

    @param lineiterator: the source of lines. A `str` or `bytes` value is
                         passed to `uopen`; a `LineBuffer` is used as is;
                         anything else is wrapped into a `LineBuffer`.
                         Lines are presumably `str` instances (they are
                         compared with '>' and converted with `str2bytes`).
    @param buffersize: forwarded to `LineBuffer` when `lineiterator` has to
                       be wrapped; ignored otherwise.
    @param skip: number of leading records to jump over. Skipped records are
                 not parsed, their lines are only read and discarded.

    @return: a generator yielding one `dict` per record with the keys
             "id", "definition" and "tags" (as returned by `parseHeader`
             for the header line), "sequence" (the sequence lines joined
             into a single `bytes` value, line terminators removed),
             "quality" (always None) and "annotation" (a new empty dict).
             Nothing is yielded for empty input or when `skip` is greater
             than or equal to the number of records.
    '''
    cdef LineBuffer lb
    cdef str ident
    cdef str definition
    cdef dict tags
    cdef list s
    cdef bytes sequence
    cdef bytes quality
    cdef int skipped

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)

    # End of input is tracked with a None sentinel instead of letting
    # StopIteration escape from the generator body: where PEP 479 applies,
    # such an escaping StopIteration is turned into a RuntimeError.
    line = next(i, None)

    # Skipping only has to happen once, before the first record is parsed.
    # Each pass leaves the current header and stops on the next one.
    skipped = 0
    while line is not None and skipped < skip:
        line = next(i, None)
        while line is not None and line[0] != '>':
            line = next(i, None)
        skipped += 1

    # Here `line` is either the header of the next record or None, so the
    # header parser is never called on a stale line once the input is
    # exhausted.
    while line is not None:
        ident, tags, definition = parseHeader(line)
        s = []

        line = next(i, None)
        while line is not None and line[0] != '>':
            # Only the line terminator is removed. Dropping the last
            # character unconditionally would lose the final residue of an
            # input that does not end with a newline.
            s.append(str2bytes(line).rstrip(b"\r\n"))
            line = next(i, None)

        sequence = b"".join(s)
        quality = None      # FASTA data carries no quality information

        yield {"id": ident,
               "definition": definition,
               "sequence": sequence,
               "quality": quality,
               "tags": tags,
               "annotation": {}
               }