Complete the fastq iterator to manage new input options

This commit is contained in:
2017-07-27 16:05:17 +02:00
parent 949e5f9baf
commit 8d9cdb4d03

View File

@ -7,17 +7,46 @@ Created on 30 mars 2016
'''
def fastqIterator(lineiterator,
                  int skip=0,
                  only=None,
                  int qualityoffset=-1,
                  bool noquality=False,
                  firstline=None,
                  int buffersize=100000000
                  ):
    '''
    Return an iterator over the records of a fastq source.

    This function only dispatches: when ``noquality`` is true the work is
    delegated to ``fastqWithoutQualityIterator``, otherwise to
    ``fastqWithQualityIterator``. Every other argument is forwarded
    unchanged to the selected iterator.

    Parameters:
        lineiterator:  source of the fastq lines, forwarded as is.
        skip:          number of leading records to skip.
        only:          maximum number of records to produce, or None.
        qualityoffset: forwarded to ``fastqWithQualityIterator`` only;
                       ignored when ``noquality`` is true.
        noquality:     select the iterator that does not keep qualities.
        firstline:     optional header line, forwarded as is.
        buffersize:    forwarded as is.

    Returns:
        The iterator built by the selected function.
    '''
    if noquality:
        return fastqWithoutQualityIterator(lineiterator,
                                           skip, only,
                                           firstline,
                                           buffersize)

    return fastqWithQualityIterator(lineiterator,
                                    skip, only,
                                    qualityoffset,
                                    firstline,
                                    buffersize)
def fastqWithQualityIterator(lineiterator,
                             int skip=0,
                             only=None,
                             int qualityoffset=-1,
                             firstline=None,
                             int buffersize=100000000
                             ):
    '''
    Generate one ``Nuc_Seq`` per fastq record, keeping the quality line.

    Parameters:
        lineiterator:  a ``LineBuffer``, any object accepted by
                       ``LineBuffer(lineiterator, buffersize)``, or a
                       str/bytes value which is first passed to ``uopen``.
        skip:          number of leading records (4 lines each) to skip.
        only:          maximum number of records to yield; None means no
                       limit. Any other value is converted with ``int``.
        qualityoffset: passed unchanged to ``Nuc_Seq`` together with the
                       quality line.
        firstline:     optional header line used for the first record
                       instead of reading it from the line iterator
                       (presumably a line the caller already consumed --
                       confirm against callers). Ignored when ``skip`` > 0.
        buffersize:    size given to ``LineBuffer`` when one has to be
                       created.

    Yields:
        ``Nuc_Seq(ident, sequence, definition, quality, qualityoffset, tags)``
        for each complete record.

    The generator ends quietly when the input is exhausted, including when
    the last record is truncated.
    '''
    cdef LineBuffer lb
    cdef str ident
    cdef str definition
    cdef dict tags
    cdef bytes sequence
    cdef bytes quality
    cdef int skipped, lines_to_skip, ionly, read

    # A negative ionly encodes "no limit on the number of records".
    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)

    # A record spans 4 lines. When firstline is supplied, one line less is
    # read from the iterator to reach the first record to keep.
    lines_to_skip = skip*4 - (firstline is not None)

    read = 0

    # StopIteration must not escape a generator body (PEP 479): running out
    # of lines simply ends the iteration.
    try:
        for skipped in range(lines_to_skip):
            next(i)

        # The supplied header belongs to a skipped record.
        if skip > 0:
            firstline = None

        if firstline is None:
            hline = next(i)
        else:
            hline = firstline
    except StopIteration:
        return

    for line in i:

        # The limit only applies when one was requested (ionly >= 0).
        if ionly >= 0 and read >= ionly:
            break

        ident, tags, definition = parseHeader(hline)
        # [0:-1] drops the last character of the line (presumably the
        # line terminator).
        sequence = str2bytes(line[0:-1])

        try:
            next(i)                              # separator line, ignored
            quality = str2bytes(next(i)[0:-1])
        except StopIteration:
            # Truncated last record: nothing more to yield.
            return

        seq = Nuc_Seq(ident,
                      sequence,
                      definition,
                      quality, qualityoffset,
                      tags)

        yield seq

        read += 1

        try:
            hline = next(i)
        except StopIteration:
            return
def fastqWithoutQualityIterator(lineiterator,
                                int skip=0,
                                only=None,
                                firstline=None,
                                int buffersize=100000000
                                ):
    '''
    Generate one ``Nuc_Seq`` per fastq record, discarding the quality line.

    Parameters:
        lineiterator: a ``LineBuffer``, any object accepted by
                      ``LineBuffer(lineiterator, buffersize)``, or a
                      str/bytes value which is first passed to ``uopen``.
        skip:         number of leading records (4 lines each) to skip.
        only:         maximum number of records to yield; None means no
                      limit. Any other value is converted with ``int``.
        firstline:    optional header line used for the first record
                      instead of reading it from the line iterator
                      (presumably a line the caller already consumed --
                      confirm against callers). Ignored when ``skip`` > 0.
        buffersize:   size given to ``LineBuffer`` when one has to be
                      created.

    Yields:
        ``Nuc_Seq(ident, sequence, definition, None, -1, tags)`` for each
        complete record.

    The generator ends quietly when the input is exhausted, including when
    the last record is truncated.
    '''
    cdef LineBuffer lb
    cdef str ident
    cdef str definition
    cdef dict tags
    cdef bytes sequence
    cdef int skipped, lines_to_skip, ionly, read

    # A negative ionly encodes "no limit on the number of records".
    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)

    # A record spans 4 lines. When firstline is supplied, one line less is
    # read from the iterator to reach the first record to keep.
    lines_to_skip = skip*4 - (firstline is not None)

    read = 0

    # StopIteration must not escape a generator body (PEP 479): running out
    # of lines simply ends the iteration.
    try:
        for skipped in range(lines_to_skip):
            next(i)

        # The supplied header belongs to a skipped record.
        if skip > 0:
            firstline = None

        if firstline is None:
            hline = next(i)
        else:
            hline = firstline
    except StopIteration:
        return

    for line in i:

        # The limit only applies when one was requested (ionly >= 0).
        if ionly >= 0 and read >= ionly:
            break

        ident, tags, definition = parseHeader(hline)
        # [0:-1] drops the last character of the line (presumably the
        # line terminator).
        sequence = str2bytes(line[0:-1])

        try:
            next(i)     # separator line, ignored
            next(i)     # quality line, ignored
        except StopIteration:
            # Truncated last record: nothing more to yield.
            return

        seq = Nuc_Seq(ident,
                      sequence,
                      definition,
                      None, -1,
                      tags)

        yield seq

        read += 1

        try:
            hline = next(i)
        except StopIteration:
            return