Complete the fastq iterator to manage new input options

This commit is contained in:
2017-07-27 16:05:17 +02:00
parent 949e5f9baf
commit 8d9cdb4d03

View File

@ -7,17 +7,46 @@ Created on 30 mars 2016
''' '''
def fastqIterator(lineiterator,
                  int skip=0,
                  only=None,
                  int qualityoffset=-1,
                  bool noquality=False,
                  firstline=None,
                  int buffersize=100000000
                  ):
    '''
    Build a fastq record iterator, with or without quality decoding.

    This function only dispatches: it forwards its arguments unchanged to
    `fastqWithoutQualityIterator` when `noquality` is true, and to
    `fastqWithQualityIterator` otherwise, and returns whatever that call
    returns.

    @param lineiterator: the input handed to the selected iterator
    @param skip: forwarded as the `skip` argument of the selected iterator
    @param only: forwarded as the `only` argument of the selected iterator
    @param qualityoffset: forwarded to `fastqWithQualityIterator` only;
                          ignored when `noquality` is true
    @param noquality: selects the iterator that does not build qualities
    @param firstline: forwarded as the `firstline` argument of the selected
                      iterator
    @param buffersize: forwarded as the `buffersize` argument of the
                       selected iterator

    @return: the iterator returned by the selected implementation
    '''
    if noquality:
        return fastqWithoutQualityIterator(lineiterator,
                                           skip,only,
                                           firstline,
                                           buffersize)
    else:
        return fastqWithQualityIterator(lineiterator,
                                        skip,only,
                                        qualityoffset,
                                        firstline,
                                        buffersize)
def fastqWithQualityIterator(lineiterator,
int skip=0,
only=None,
int qualityoffset=-1,
firstline=None,
int buffersize=100000000
):
cdef LineBuffer lb cdef LineBuffer lb
cdef str ident cdef str ident
cdef str definition cdef str definition
cdef dict tags cdef dict tags
cdef bytes sequence cdef bytes sequence
cdef bytes quality cdef bytes quality
cdef int skipped, lines_to_skip cdef int skipped, lines_to_skip, ionly, read
cdef int j cdef int j
if only is None:
ionly=-1
else:
ionly=int(only)
if isinstance(lineiterator,(str,bytes)): if isinstance(lineiterator,(str,bytes)):
lineiterator=uopen(lineiterator) lineiterator=uopen(lineiterator)
@ -26,28 +55,125 @@ def fastqIterator(lineiterator,
else: else:
lb=LineBuffer(lineiterator,buffersize) lb=LineBuffer(lineiterator,buffersize)
lines_to_skip = skip*4
skipped = 0
i = iter(lb) i = iter(lb)
lines_to_skip = skip*4 - (firstline is not None)
for skipped in range(lines_to_skip):
next(i)
if skip > 0:
firstline=None
read=0
if firstline is None:
hline = next(i)
else:
hline = firstline
for line in i: for line in i:
if skipped < lines_to_skip : if read >= ionly:
skipped += 1 break
pass
else : ident,tags,definition = parseHeader(hline)
ident,tags,definition = parseHeader(line) sequence = str2bytes(line[0:-1])
sequence = str2bytes(next(i)[0:-1])
next(i) next(i)
quality = str2bytes(next(i)[0:-1]) quality = str2bytes(next(i)[0:-1])
yield { "id" : ident, seq = Nuc_Seq(ident,
"definition" : definition, sequence,
"sequence" : sequence, definition,
"quality" : quality, quality,qualityoffset,
"tags" : tags, tags)
"annotation" : {}
} yield seq
# yield { "id" : ident,
# "definition" : definition,
# "sequence" : sequence,
# "quality" : quality,
# "offset" : qualityoffset,
# "tags" : tags,
# "annotation" : {}
# }
read+=1
hline = next(i)
def fastqWithoutQualityIterator(lineiterator,
                                int skip=0,
                                only=None,
                                firstline=None,
                                int buffersize=100000000
                                ):
    '''
    Iterate over fastq records, ignoring their quality lines.

    Records are read as groups of four lines: header, sequence, separator
    and quality. The separator and quality lines are consumed and discarded.

    @param lineiterator: a file name (`str` or `bytes`, opened with `uopen`),
                         a `LineBuffer`, or any other object, which is then
                         wrapped in `LineBuffer(lineiterator, buffersize)`
    @param skip: number of leading records (4 lines each) to skip
    @param only: maximum number of records to yield; `None` or a negative
                 value means no limit
    @param firstline: header line of the first record when the caller has
                      already consumed it from `lineiterator`; ignored as a
                      header when `skip` > 0
    @param buffersize: size passed to `LineBuffer` when one has to be built

    @return: a generator yielding one
             `Nuc_Seq(ident, sequence, definition, None, -1, tags)`
             per record
    '''
    cdef LineBuffer lb
    cdef str ident
    cdef str definition
    cdef dict tags
    cdef bytes sequence
    cdef int skipped, lines_to_skip, ionly, read

    # A negative ionly encodes "no limit on the number of records".
    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)

    # When firstline is provided, one line of the first record has already
    # been consumed by the caller, so one line less has to be skipped.
    lines_to_skip = skip*4 - (firstline is not None)

    try:
        for skipped in range(lines_to_skip):
            next(i)

        # After skipping, the provided first line belongs to a skipped
        # record: the next header must be read from the stream.
        if skip > 0:
            firstline = None

        if firstline is None:
            hline = next(i)
        else:
            hline = firstline
    except StopIteration:
        # Input exhausted before the first record to report: nothing to
        # yield. Returning explicitly keeps StopIteration from leaking out
        # of the generator body (RuntimeError under PEP 479).
        return

    read = 0

    for line in i:
        # The limit applies only when it is non-negative; testing
        # `read >= ionly` alone stopped immediately when only was None.
        if ionly >= 0 and read >= ionly:
            break

        ident,tags,definition = parseHeader(hline)
        sequence = str2bytes(line[0:-1])    # drop the trailing character

        try:
            next(i)     # separator line, discarded
            next(i)     # quality line, discarded
        except StopIteration:
            # Truncated final record: stop without yielding it, as the
            # original code never reached its yield in this case.
            return

        seq = Nuc_Seq(ident,
                      sequence,
                      definition,
                      None,-1,
                      tags)

        yield seq

        read+=1

        try:
            hline = next(i)
        except StopIteration:
            # Regular end of input after a complete record.
            return