obi import: skipping sequences is now handled by the iterators themselves, so that
skipped sequences are no longer needlessly parsed
This commit is contained in:
@ -155,12 +155,12 @@ def run(config):
|
||||
if config['import']['seqinformat']=='fasta':
|
||||
get_quality = False
|
||||
NUC_SEQS_view = True
|
||||
iseq = fastaIterator(inputs)
|
||||
iseq = fastaIterator(inputs, skip=config['import']['skip'])
|
||||
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
|
||||
elif config['import']['seqinformat']=='fastq':
|
||||
get_quality = True
|
||||
NUC_SEQS_view = True
|
||||
iseq = fastqIterator(inputs)
|
||||
iseq = fastqIterator(inputs, skip=config['import']['skip'])
|
||||
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
|
||||
else:
|
||||
raise RuntimeError('File format not handled')
|
||||
@ -175,12 +175,9 @@ def run(config):
|
||||
|
||||
dcols = {}
|
||||
|
||||
skipping = 0
|
||||
i = 0
|
||||
for seq in iseq :
|
||||
if skipping < config['import']['skip'] : # TODO not efficient because sequences are parsed
|
||||
skipping+=1
|
||||
elif i == config['import']['only'] :
|
||||
if i == config['import']['only'] :
|
||||
break
|
||||
else :
|
||||
pb(i)
|
||||
|
@ -10,7 +10,8 @@ Created on 30 mars 2016
|
||||
|
||||
|
||||
def fastaIterator(lineiterator,
|
||||
int buffersize=100000000
|
||||
int buffersize=100000000,
|
||||
int skip=0
|
||||
):
|
||||
cdef LineBuffer lb
|
||||
cdef str ident
|
||||
@ -19,6 +20,7 @@ def fastaIterator(lineiterator,
|
||||
cdef list s
|
||||
cdef bytes sequence
|
||||
cdef bytes quality
|
||||
cdef int skipped
|
||||
# cdef OBI_Seq seq
|
||||
|
||||
if isinstance(lineiterator,(str,bytes)):
|
||||
@ -28,31 +30,43 @@ def fastaIterator(lineiterator,
|
||||
lb=lineiterator
|
||||
else:
|
||||
lb=LineBuffer(lineiterator,buffersize)
|
||||
|
||||
|
||||
skipped = 0
|
||||
i = iter(lb)
|
||||
line = next(i)
|
||||
|
||||
while True:
|
||||
|
||||
while skipped < skip :
|
||||
line = next(i)
|
||||
try:
|
||||
while line[0]!='>':
|
||||
line = next(i)
|
||||
except StopIteration:
|
||||
pass
|
||||
skipped += 1
|
||||
|
||||
ident,tags,definition = parseHeader(line)
|
||||
s = []
|
||||
line = next(i)
|
||||
|
||||
|
||||
try:
|
||||
while line[0]!='>':
|
||||
s.append(str2bytes(line)[0:-1])
|
||||
line = next(i)
|
||||
|
||||
except StopIteration:
|
||||
pass
|
||||
|
||||
sequence = b"".join(s)
|
||||
quality = None
|
||||
|
||||
|
||||
# seq = OBI_Seq(id,
|
||||
# sequence,
|
||||
# definition,
|
||||
# tags=tags,
|
||||
# )
|
||||
|
||||
# seq = OBI_Seq(id,
|
||||
# sequence,
|
||||
# definition,
|
||||
# tags=tags,
|
||||
# )
|
||||
|
||||
yield { "id" : ident,
|
||||
"definition" : definition,
|
||||
"sequence" : sequence,
|
||||
@ -60,9 +74,12 @@ def fastaIterator(lineiterator,
|
||||
"tags" : tags,
|
||||
"annotation" : {}
|
||||
}
|
||||
|
||||
|
||||
|
||||
def fastaNucIterator(lineiterator, int buffersize=100000000):
|
||||
def fastaNucIterator(lineiterator,
|
||||
int buffersize=100000000,
|
||||
int skip=0
|
||||
):
|
||||
cdef LineBuffer lb
|
||||
cdef str ident
|
||||
cdef str definition
|
||||
@ -70,6 +87,7 @@ def fastaNucIterator(lineiterator, int buffersize=100000000):
|
||||
cdef list s
|
||||
cdef bytes sequence
|
||||
cdef bytes quality
|
||||
cdef int skipped
|
||||
# cdef OBI_Seq seq
|
||||
|
||||
if isinstance(lineiterator,(str,bytes)):
|
||||
@ -79,11 +97,23 @@ def fastaNucIterator(lineiterator, int buffersize=100000000):
|
||||
lb=lineiterator
|
||||
else:
|
||||
lb=LineBuffer(lineiterator,buffersize)
|
||||
|
||||
|
||||
|
||||
skipped = 0
|
||||
i = iter(lb)
|
||||
line = next(i)
|
||||
|
||||
while True:
|
||||
|
||||
while skipped < skip :
|
||||
line = next(i)
|
||||
try:
|
||||
while line[0]!='>':
|
||||
line = next(i)
|
||||
except StopIteration:
|
||||
pass
|
||||
skipped += 1
|
||||
|
||||
ident,tags,definition = parseHeader(line)
|
||||
s = []
|
||||
line = next(i)
|
||||
|
@ -6,13 +6,17 @@ Created on 30 mars 2016
|
||||
@author: coissac
|
||||
'''
|
||||
|
||||
def fastqIterator(lineiterator, int buffersize=100000000):
|
||||
def fastqIterator(lineiterator,
|
||||
int buffersize=100000000,
|
||||
int skip=0):
|
||||
cdef LineBuffer lb
|
||||
cdef str ident
|
||||
cdef str definition
|
||||
cdef dict tags
|
||||
cdef bytes sequence
|
||||
cdef bytes quality
|
||||
cdef int skipped, lines_to_skip
|
||||
cdef int j
|
||||
|
||||
if isinstance(lineiterator,(str,bytes)):
|
||||
lineiterator=uopen(lineiterator)
|
||||
@ -21,20 +25,29 @@ def fastqIterator(lineiterator, int buffersize=100000000):
|
||||
lb=lineiterator
|
||||
else:
|
||||
lb=LineBuffer(lineiterator,buffersize)
|
||||
|
||||
|
||||
lines_to_skip = skip*4
|
||||
skipped = 0
|
||||
i = iter(lb)
|
||||
|
||||
for line in i:
|
||||
ident,tags,definition = parseHeader(line)
|
||||
sequence = str2bytes(next(i)[0:-1])
|
||||
next(i)
|
||||
quality = str2bytes(next(i)[0:-1])
|
||||
|
||||
yield { "id" : ident,
|
||||
"definition" : definition,
|
||||
"sequence" : sequence,
|
||||
"quality" : quality,
|
||||
"tags" : tags,
|
||||
"annotation" : {}
|
||||
}
|
||||
|
||||
if skipped < lines_to_skip :
|
||||
skipped += 1
|
||||
pass
|
||||
|
||||
else :
|
||||
ident,tags,definition = parseHeader(line)
|
||||
sequence = str2bytes(next(i)[0:-1])
|
||||
next(i)
|
||||
quality = str2bytes(next(i)[0:-1])
|
||||
|
||||
yield { "id" : ident,
|
||||
"definition" : definition,
|
||||
"sequence" : sequence,
|
||||
"quality" : quality,
|
||||
"tags" : tags,
|
||||
"annotation" : {}
|
||||
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user