obi import: Skipping sequences is now done through the iterators so that

skipped sequences are not needlessly parsed
This commit is contained in:
Celine Mercier
2017-07-10 17:02:30 +02:00
parent b94ec9557f
commit b6b95f26b6
3 changed files with 73 additions and 33 deletions

View File

@ -155,12 +155,12 @@ def run(config):
if config['import']['seqinformat']=='fasta': if config['import']['seqinformat']=='fasta':
get_quality = False get_quality = False
NUC_SEQS_view = True NUC_SEQS_view = True
iseq = fastaIterator(inputs) iseq = fastaIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
elif config['import']['seqinformat']=='fastq': elif config['import']['seqinformat']=='fastq':
get_quality = True get_quality = True
NUC_SEQS_view = True NUC_SEQS_view = True
iseq = fastqIterator(inputs) iseq = fastqIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
else: else:
raise RuntimeError('File format not handled') raise RuntimeError('File format not handled')
@ -175,12 +175,9 @@ def run(config):
dcols = {} dcols = {}
skipping = 0
i = 0 i = 0
for seq in iseq : for seq in iseq :
if skipping < config['import']['skip'] : # TODO not efficient because sequences are parsed if i == config['import']['only'] :
skipping+=1
elif i == config['import']['only'] :
break break
else : else :
pb(i) pb(i)

View File

@ -10,7 +10,8 @@ Created on 30 mars 2016
def fastaIterator(lineiterator, def fastaIterator(lineiterator,
int buffersize=100000000 int buffersize=100000000,
int skip=0
): ):
cdef LineBuffer lb cdef LineBuffer lb
cdef str ident cdef str ident
@ -19,6 +20,7 @@ def fastaIterator(lineiterator,
cdef list s cdef list s
cdef bytes sequence cdef bytes sequence
cdef bytes quality cdef bytes quality
cdef int skipped
# cdef OBI_Seq seq # cdef OBI_Seq seq
if isinstance(lineiterator,(str,bytes)): if isinstance(lineiterator,(str,bytes)):
@ -29,10 +31,21 @@ def fastaIterator(lineiterator,
else: else:
lb=LineBuffer(lineiterator,buffersize) lb=LineBuffer(lineiterator,buffersize)
skipped = 0
i = iter(lb) i = iter(lb)
line = next(i) line = next(i)
while True: while True:
while skipped < skip :
line = next(i)
try:
while line[0]!='>':
line = next(i)
except StopIteration:
pass
skipped += 1
ident,tags,definition = parseHeader(line) ident,tags,definition = parseHeader(line)
s = [] s = []
line = next(i) line = next(i)
@ -41,18 +54,19 @@ def fastaIterator(lineiterator,
while line[0]!='>': while line[0]!='>':
s.append(str2bytes(line)[0:-1]) s.append(str2bytes(line)[0:-1])
line = next(i) line = next(i)
except StopIteration: except StopIteration:
pass pass
sequence = b"".join(s) sequence = b"".join(s)
quality = None quality = None
# seq = OBI_Seq(id,
# sequence,
# definition,
# tags=tags,
# )
# seq = OBI_Seq(id,
# sequence,
# definition,
# tags=tags,
# )
yield { "id" : ident, yield { "id" : ident,
"definition" : definition, "definition" : definition,
"sequence" : sequence, "sequence" : sequence,
@ -62,7 +76,10 @@ def fastaIterator(lineiterator,
} }
def fastaNucIterator(lineiterator, int buffersize=100000000): def fastaNucIterator(lineiterator,
int buffersize=100000000,
int skip=0
):
cdef LineBuffer lb cdef LineBuffer lb
cdef str ident cdef str ident
cdef str definition cdef str definition
@ -70,6 +87,7 @@ def fastaNucIterator(lineiterator, int buffersize=100000000):
cdef list s cdef list s
cdef bytes sequence cdef bytes sequence
cdef bytes quality cdef bytes quality
cdef int skipped
# cdef OBI_Seq seq # cdef OBI_Seq seq
if isinstance(lineiterator,(str,bytes)): if isinstance(lineiterator,(str,bytes)):
@ -80,10 +98,22 @@ def fastaNucIterator(lineiterator, int buffersize=100000000):
else: else:
lb=LineBuffer(lineiterator,buffersize) lb=LineBuffer(lineiterator,buffersize)
skipped = 0
i = iter(lb) i = iter(lb)
line = next(i) line = next(i)
while True: while True:
while skipped < skip :
line = next(i)
try:
while line[0]!='>':
line = next(i)
except StopIteration:
pass
skipped += 1
ident,tags,definition = parseHeader(line) ident,tags,definition = parseHeader(line)
s = [] s = []
line = next(i) line = next(i)

View File

@ -6,13 +6,17 @@ Created on 30 mars 2016
@author: coissac @author: coissac
''' '''
def fastqIterator(lineiterator, int buffersize=100000000): def fastqIterator(lineiterator,
int buffersize=100000000,
int skip=0):
cdef LineBuffer lb cdef LineBuffer lb
cdef str ident cdef str ident
cdef str definition cdef str definition
cdef dict tags cdef dict tags
cdef bytes sequence cdef bytes sequence
cdef bytes quality cdef bytes quality
cdef int skipped, lines_to_skip
cdef int j
if isinstance(lineiterator,(str,bytes)): if isinstance(lineiterator,(str,bytes)):
lineiterator=uopen(lineiterator) lineiterator=uopen(lineiterator)
@ -22,8 +26,17 @@ def fastqIterator(lineiterator, int buffersize=100000000):
else: else:
lb=LineBuffer(lineiterator,buffersize) lb=LineBuffer(lineiterator,buffersize)
lines_to_skip = skip*4
skipped = 0
i = iter(lb) i = iter(lb)
for line in i: for line in i:
if skipped < lines_to_skip :
skipped += 1
pass
else :
ident,tags,definition = parseHeader(line) ident,tags,definition = parseHeader(line)
sequence = str2bytes(next(i)[0:-1]) sequence = str2bytes(next(i)[0:-1])
next(i) next(i)