def fastqIterator(lineiterator,
                  int skip=0,
                  only=None,
                  int qualityoffset=-1,
                  bool noquality=False,
                  firstline=None,
                  int buffersize=100000000
                  ):
    """Return an iterator over the records of a FASTQ source.

    Dispatches to `fastqWithoutQualityIterator` when `noquality` is true,
    otherwise to `fastqWithQualityIterator`.

    Parameters:
        lineiterator: a `str`/`bytes` (handed to `uopen`), a `LineBuffer`,
            or any other object accepted by `LineBuffer(...)`.
        skip: number of leading records (4 lines each) to discard.
        only: maximum number of records to produce; `None` means no limit.
        qualityoffset: forwarded unchanged to `Nuc_Seq` together with the
            quality string (ignored when `noquality` is true).
        noquality: when true, quality lines are read but not stored.
        firstline: header line of the first record when the caller has
            already consumed it from `lineiterator`; `None` otherwise.
        buffersize: size passed to `LineBuffer` when one has to be created.

    Returns:
        A generator yielding one `Nuc_Seq` per record.
    """
    if noquality:
        return fastqWithoutQualityIterator(lineiterator,
                                           skip, only,
                                           firstline,
                                           buffersize)

    return fastqWithQualityIterator(lineiterator,
                                    skip, only,
                                    qualityoffset,
                                    firstline,
                                    buffersize)


def _fastqRecordLines(lineiterator,
                      int skip,
                      only,
                      firstline,
                      int buffersize):
    """Yield `(header, sequence, quality)` raw lines for each FASTQ record.

    Private helper shared by `fastqWithQualityIterator` and
    `fastqWithoutQualityIterator`; it owns source opening, record skipping,
    the `only` limit and end-of-input handling. Lines are yielded exactly as
    produced by the underlying `LineBuffer` (nothing is stripped here); the
    third line of each record (the separator) is read and dropped.

    Iteration ends silently when the input is exhausted, including when it
    ends in the middle of a record or inside the skipped region.
    """
    cdef LineBuffer lb
    cdef int lines_to_skip
    cdef long long ionly, read

    # A negative limit means "no limit".
    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)

    # When `firstline` is given, the first header has already been taken out
    # of the stream, so one line fewer remains to be discarded. With skip == 0
    # this is -1 and range() is simply empty.
    lines_to_skip = skip * 4 - (firstline is not None)

    # Once records have been skipped, `firstline` belongs to a discarded
    # record and must not be used as the header of the first yielded one.
    if skip > 0:
        hline = None
    else:
        hline = firstline

    read = 0

    try:
        for _ in range(lines_to_skip):
            next(i)

        while ionly < 0 or read < ionly:
            # Pull the header only when a record is really going to be
            # produced, so that no line is consumed past the `only` limit.
            if hline is None:
                hline = next(i)

            seqline = next(i)
            next(i)                 # separator line, content not used
            qualline = next(i)

            yield hline, seqline, qualline

            read += 1
            hline = None
    except StopIteration:
        # End of input: finish the generator cleanly instead of letting
        # StopIteration escape from the generator body.
        return


def fastqWithQualityIterator(lineiterator,
                             int skip=0,
                             only=None,
                             int qualityoffset=-1,
                             firstline=None,
                             int buffersize=100000000
                             ):
    """Iterate over FASTQ records, keeping the quality string.

    Parameters:
        lineiterator: a `str`/`bytes` (handed to `uopen`), a `LineBuffer`,
            or any other object accepted by `LineBuffer(...)`.
        skip: number of leading records (4 lines each) to discard.
        only: maximum number of records to produce; `None` means no limit.
        qualityoffset: passed as-is to `Nuc_Seq` after the quality bytes.
            NOTE(review): -1 presumably asks `Nuc_Seq` to use its own
            default/detection - confirm against `Nuc_Seq`.
        firstline: header line of the first record when the caller has
            already consumed it; `None` otherwise.
        buffersize: size passed to `LineBuffer` when one has to be created.

    Yields:
        `Nuc_Seq(ident, sequence, definition, quality, qualityoffset, tags)`
        where `ident`, `tags`, `definition` come from `parseHeader(header)`.
    """
    cdef str ident
    cdef str definition
    cdef dict tags
    cdef bytes sequence
    cdef bytes quality

    for hline, seqline, qualline in _fastqRecordLines(lineiterator,
                                                      skip, only,
                                                      firstline,
                                                      buffersize):
        ident, tags, definition = parseHeader(hline)
        # [0:-1] drops the last character of the line (the line terminator,
        # assuming every line is newline-terminated).
        sequence = str2bytes(seqline[0:-1])
        quality = str2bytes(qualline[0:-1])

        yield Nuc_Seq(ident,
                      sequence,
                      definition,
                      quality, qualityoffset,
                      tags)


def fastqWithoutQualityIterator(lineiterator,
                                int skip=0,
                                only=None,
                                firstline=None,
                                int buffersize=100000000
                                ):
    """Iterate over FASTQ records, discarding the quality string.

    Same reading logic as `fastqWithQualityIterator`, but each record is
    built as `Nuc_Seq(ident, sequence, definition, None, -1, tags)`.

    Parameters:
        lineiterator: a `str`/`bytes` (handed to `uopen`), a `LineBuffer`,
            or any other object accepted by `LineBuffer(...)`.
        skip: number of leading records (4 lines each) to discard.
        only: maximum number of records to produce; `None` means no limit.
        firstline: header line of the first record when the caller has
            already consumed it; `None` otherwise.
        buffersize: size passed to `LineBuffer` when one has to be created.

    Yields:
        One `Nuc_Seq` per record, without quality information.
    """
    cdef str ident
    cdef str definition
    cdef dict tags
    cdef bytes sequence

    for hline, seqline, _qualline in _fastqRecordLines(lineiterator,
                                                       skip, only,
                                                       firstline,
                                                       buffersize):
        ident, tags, definition = parseHeader(hline)
        # [0:-1] drops the last character of the line (the line terminator,
        # assuming every line is newline-terminated).
        sequence = str2bytes(seqline[0:-1])

        yield Nuc_Seq(ident,
                      sequence,
                      definition,
                      None, -1,
                      tags)