From b6b95f26b6d4b1af4ea6c3064fcf473f5c7bd0f8 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 10 Jul 2017 17:02:30 +0200 Subject: [PATCH] obi import: Skipping sequences is now done through the iterators so that sequences are not uselessly parsed --- python/obitools3/commands/import.pyx | 9 ++--- python/obitools3/parsers/fasta.pyx | 56 +++++++++++++++++++++------- python/obitools3/parsers/fastq.pyx | 41 +++++++++++++------- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index b5a7de0..1c66471 100644 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -155,12 +155,12 @@ def run(config): if config['import']['seqinformat']=='fasta': get_quality = False NUC_SEQS_view = True - iseq = fastaIterator(inputs) + iseq = fastaIterator(inputs, skip=config['import']['skip']) view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) elif config['import']['seqinformat']=='fastq': get_quality = True NUC_SEQS_view = True - iseq = fastqIterator(inputs) + iseq = fastqIterator(inputs, skip=config['import']['skip']) view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) else: raise RuntimeError('File format not handled') @@ -175,12 +175,9 @@ def run(config): dcols = {} - skipping = 0 i = 0 for seq in iseq : - if skipping < config['import']['skip'] : # TODO not efficient because sequences are parsed - skipping+=1 - elif i == config['import']['only'] : + if i == config['import']['only'] : break else : pb(i) diff --git a/python/obitools3/parsers/fasta.pyx b/python/obitools3/parsers/fasta.pyx index 685b865..38ac738 100644 --- a/python/obitools3/parsers/fasta.pyx +++ b/python/obitools3/parsers/fasta.pyx @@ -10,7 +10,8 @@ Created on 30 mars 2016 def fastaIterator(lineiterator, - int buffersize=100000000 + int buffersize=100000000, + int skip=0 ): cdef LineBuffer lb cdef str ident @@ -19,6 +20,7 @@ def fastaIterator(lineiterator, cdef list s cdef bytes sequence cdef bytes quality + cdef int skipped # cdef OBI_Seq seq if isinstance(lineiterator,(str,bytes)): @@ -28,31 +30,43 @@ def fastaIterator(lineiterator, lb=lineiterator else: lb=LineBuffer(lineiterator,buffersize) - + + skipped = 0 i = iter(lb) line = next(i) while True: + + while skipped < skip : + line = next(i) + try: + while line[0]!='>': + line = next(i) + except StopIteration: + pass + skipped += 1 + ident,tags,definition = parseHeader(line) s = [] line = next(i) - + try: while line[0]!='>': s.append(str2bytes(line)[0:-1]) line = next(i) + except StopIteration: pass sequence = b"".join(s) quality = None - - -# seq = OBI_Seq(id, -# sequence, -# definition, -# tags=tags, -# ) + + # seq = OBI_Seq(id, + # sequence, + # definition, + # tags=tags, + # ) + yield { "id" : ident, "definition" : definition, "sequence" : sequence, @@ -60,9 +74,12 @@ def fastaIterator(lineiterator, "tags" : tags, "annotation" : {} } - + -def fastaNucIterator(lineiterator, int buffersize=100000000): +def fastaNucIterator(lineiterator, + int buffersize=100000000, + int skip=0 + ): cdef LineBuffer lb cdef str ident cdef str definition @@ -70,6 +87,7 @@ def fastaNucIterator(lineiterator, int buffersize=100000000): cdef list s cdef bytes sequence cdef bytes quality + cdef int skipped # cdef OBI_Seq seq if isinstance(lineiterator,(str,bytes)): @@ -79,11 +97,23 @@ def fastaNucIterator(lineiterator, int buffersize=100000000): lb=lineiterator else: lb=LineBuffer(lineiterator,buffersize) - + + + skipped = 0 i = iter(lb) line = next(i) while True: + + while skipped < skip : + line = next(i) + try: + while line[0]!='>': + line = next(i) + except StopIteration: + pass + skipped += 1 + ident,tags,definition = parseHeader(line) s = [] line = next(i) diff --git a/python/obitools3/parsers/fastq.pyx b/python/obitools3/parsers/fastq.pyx index 2c600e4..eb3e0f5 100644 --- a/python/obitools3/parsers/fastq.pyx +++ b/python/obitools3/parsers/fastq.pyx @@ -6,13 +6,17 @@ Created on 30 mars 2016 @author: coissac ''' -def fastqIterator(lineiterator, int buffersize=100000000): +def fastqIterator(lineiterator, + int buffersize=100000000, + int skip=0): cdef LineBuffer lb cdef str ident cdef str definition cdef dict tags cdef bytes sequence cdef bytes quality + cdef int skipped, lines_to_skip + cdef int j if isinstance(lineiterator,(str,bytes)): lineiterator=uopen(lineiterator) @@ -21,20 +25,29 @@ def fastqIterator(lineiterator, int buffersize=100000000): lb=lineiterator else: lb=LineBuffer(lineiterator,buffersize) - + + lines_to_skip = skip*4 + skipped = 0 i = iter(lb) + for line in i: - ident,tags,definition = parseHeader(line) - sequence = str2bytes(next(i)[0:-1]) - next(i) - quality = str2bytes(next(i)[0:-1]) - - yield { "id" : ident, - "definition" : definition, - "sequence" : sequence, - "quality" : quality, - "tags" : tags, - "annotation" : {} - } + + if skipped < lines_to_skip : + skipped += 1 + pass + + else : + ident,tags,definition = parseHeader(line) + sequence = str2bytes(next(i)[0:-1]) + next(i) + quality = str2bytes(next(i)[0:-1]) + + yield { "id" : ident, + "definition" : definition, + "sequence" : sequence, + "quality" : quality, + "tags" : tags, + "annotation" : {} + }