obi import: Skipping sequences is now done through the iterators so that

skipped sequences are not needlessly parsed
This commit is contained in:
Celine Mercier
2017-07-10 17:02:30 +02:00
parent b94ec9557f
commit b6b95f26b6
3 changed files with 73 additions and 33 deletions

View File

@ -155,12 +155,12 @@ def run(config):
if config['import']['seqinformat']=='fasta':
get_quality = False
NUC_SEQS_view = True
iseq = fastaIterator(inputs)
iseq = fastaIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
elif config['import']['seqinformat']=='fastq':
get_quality = True
NUC_SEQS_view = True
iseq = fastqIterator(inputs)
iseq = fastqIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
else:
raise RuntimeError('File format not handled')
@ -175,12 +175,9 @@ def run(config):
dcols = {}
skipping = 0
i = 0
for seq in iseq :
if skipping < config['import']['skip'] : # TODO not efficient because sequences are parsed
skipping+=1
elif i == config['import']['only'] :
if i == config['import']['only'] :
break
else :
pb(i)

View File

@ -10,7 +10,8 @@ Created on 30 mars 2016
def fastaIterator(lineiterator,
int buffersize=100000000
int buffersize=100000000,
int skip=0
):
cdef LineBuffer lb
cdef str ident
@ -19,6 +20,7 @@ def fastaIterator(lineiterator,
cdef list s
cdef bytes sequence
cdef bytes quality
cdef int skipped
# cdef OBI_Seq seq
if isinstance(lineiterator,(str,bytes)):
@ -29,10 +31,21 @@ def fastaIterator(lineiterator,
else:
lb=LineBuffer(lineiterator,buffersize)
skipped = 0
i = iter(lb)
line = next(i)
while True:
while skipped < skip :
line = next(i)
try:
while line[0]!='>':
line = next(i)
except StopIteration:
pass
skipped += 1
ident,tags,definition = parseHeader(line)
s = []
line = next(i)
@ -41,18 +54,19 @@ def fastaIterator(lineiterator,
while line[0]!='>':
s.append(str2bytes(line)[0:-1])
line = next(i)
except StopIteration:
pass
sequence = b"".join(s)
quality = None
# seq = OBI_Seq(id,
# sequence,
# definition,
# tags=tags,
# )
# seq = OBI_Seq(id,
# sequence,
# definition,
# tags=tags,
# )
yield { "id" : ident,
"definition" : definition,
"sequence" : sequence,
@ -62,7 +76,10 @@ def fastaIterator(lineiterator,
}
def fastaNucIterator(lineiterator, int buffersize=100000000):
def fastaNucIterator(lineiterator,
int buffersize=100000000,
int skip=0
):
cdef LineBuffer lb
cdef str ident
cdef str definition
@ -70,6 +87,7 @@ def fastaNucIterator(lineiterator, int buffersize=100000000):
cdef list s
cdef bytes sequence
cdef bytes quality
cdef int skipped
# cdef OBI_Seq seq
if isinstance(lineiterator,(str,bytes)):
@ -80,10 +98,22 @@ def fastaNucIterator(lineiterator, int buffersize=100000000):
else:
lb=LineBuffer(lineiterator,buffersize)
skipped = 0
i = iter(lb)
line = next(i)
while True:
while skipped < skip :
line = next(i)
try:
while line[0]!='>':
line = next(i)
except StopIteration:
pass
skipped += 1
ident,tags,definition = parseHeader(line)
s = []
line = next(i)

View File

@ -6,13 +6,17 @@ Created on 30 mars 2016
@author: coissac
'''
def fastqIterator(lineiterator, int buffersize=100000000):
def fastqIterator(lineiterator,
int buffersize=100000000,
int skip=0):
cdef LineBuffer lb
cdef str ident
cdef str definition
cdef dict tags
cdef bytes sequence
cdef bytes quality
cdef int skipped, lines_to_skip
cdef int j
if isinstance(lineiterator,(str,bytes)):
lineiterator=uopen(lineiterator)
@ -22,19 +26,28 @@ def fastqIterator(lineiterator, int buffersize=100000000):
else:
lb=LineBuffer(lineiterator,buffersize)
lines_to_skip = skip*4
skipped = 0
i = iter(lb)
for line in i:
ident,tags,definition = parseHeader(line)
sequence = str2bytes(next(i)[0:-1])
next(i)
quality = str2bytes(next(i)[0:-1])
yield { "id" : ident,
"definition" : definition,
"sequence" : sequence,
"quality" : quality,
"tags" : tags,
"annotation" : {}
}
if skipped < lines_to_skip :
skipped += 1
pass
else :
ident,tags,definition = parseHeader(line)
sequence = str2bytes(next(i)[0:-1])
next(i)
quality = str2bytes(next(i)[0:-1])
yield { "id" : ident,
"definition" : definition,
"sequence" : sequence,
"quality" : quality,
"tags" : tags,
"annotation" : {}
}