obi import: skipping sequences is now handled by the iterators themselves, so
that skipped sequences are no longer parsed needlessly
This commit is contained in:
@ -155,12 +155,12 @@ def run(config):
|
|||||||
if config['import']['seqinformat']=='fasta':
|
if config['import']['seqinformat']=='fasta':
|
||||||
get_quality = False
|
get_quality = False
|
||||||
NUC_SEQS_view = True
|
NUC_SEQS_view = True
|
||||||
iseq = fastaIterator(inputs)
|
iseq = fastaIterator(inputs, skip=config['import']['skip'])
|
||||||
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
|
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
|
||||||
elif config['import']['seqinformat']=='fastq':
|
elif config['import']['seqinformat']=='fastq':
|
||||||
get_quality = True
|
get_quality = True
|
||||||
NUC_SEQS_view = True
|
NUC_SEQS_view = True
|
||||||
iseq = fastqIterator(inputs)
|
iseq = fastqIterator(inputs, skip=config['import']['skip'])
|
||||||
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
|
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError('File format not handled')
|
raise RuntimeError('File format not handled')
|
||||||
@ -175,12 +175,9 @@ def run(config):
|
|||||||
|
|
||||||
dcols = {}
|
dcols = {}
|
||||||
|
|
||||||
skipping = 0
|
|
||||||
i = 0
|
i = 0
|
||||||
for seq in iseq :
|
for seq in iseq :
|
||||||
if skipping < config['import']['skip'] : # TODO not efficient because sequences are parsed
|
if i == config['import']['only'] :
|
||||||
skipping+=1
|
|
||||||
elif i == config['import']['only'] :
|
|
||||||
break
|
break
|
||||||
else :
|
else :
|
||||||
pb(i)
|
pb(i)
|
||||||
|
@ -10,7 +10,8 @@ Created on 30 mars 2016
|
|||||||
|
|
||||||
|
|
||||||
def fastaIterator(lineiterator,
|
def fastaIterator(lineiterator,
|
||||||
int buffersize=100000000
|
int buffersize=100000000,
|
||||||
|
int skip=0
|
||||||
):
|
):
|
||||||
cdef LineBuffer lb
|
cdef LineBuffer lb
|
||||||
cdef str ident
|
cdef str ident
|
||||||
@ -19,6 +20,7 @@ def fastaIterator(lineiterator,
|
|||||||
cdef list s
|
cdef list s
|
||||||
cdef bytes sequence
|
cdef bytes sequence
|
||||||
cdef bytes quality
|
cdef bytes quality
|
||||||
|
cdef int skipped
|
||||||
# cdef OBI_Seq seq
|
# cdef OBI_Seq seq
|
||||||
|
|
||||||
if isinstance(lineiterator,(str,bytes)):
|
if isinstance(lineiterator,(str,bytes)):
|
||||||
@ -29,10 +31,21 @@ def fastaIterator(lineiterator,
|
|||||||
else:
|
else:
|
||||||
lb=LineBuffer(lineiterator,buffersize)
|
lb=LineBuffer(lineiterator,buffersize)
|
||||||
|
|
||||||
|
skipped = 0
|
||||||
i = iter(lb)
|
i = iter(lb)
|
||||||
line = next(i)
|
line = next(i)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
|
while skipped < skip :
|
||||||
|
line = next(i)
|
||||||
|
try:
|
||||||
|
while line[0]!='>':
|
||||||
|
line = next(i)
|
||||||
|
except StopIteration:
|
||||||
|
pass
|
||||||
|
skipped += 1
|
||||||
|
|
||||||
ident,tags,definition = parseHeader(line)
|
ident,tags,definition = parseHeader(line)
|
||||||
s = []
|
s = []
|
||||||
line = next(i)
|
line = next(i)
|
||||||
@ -41,18 +54,19 @@ def fastaIterator(lineiterator,
|
|||||||
while line[0]!='>':
|
while line[0]!='>':
|
||||||
s.append(str2bytes(line)[0:-1])
|
s.append(str2bytes(line)[0:-1])
|
||||||
line = next(i)
|
line = next(i)
|
||||||
|
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
sequence = b"".join(s)
|
sequence = b"".join(s)
|
||||||
quality = None
|
quality = None
|
||||||
|
|
||||||
|
# seq = OBI_Seq(id,
|
||||||
|
# sequence,
|
||||||
|
# definition,
|
||||||
|
# tags=tags,
|
||||||
|
# )
|
||||||
|
|
||||||
# seq = OBI_Seq(id,
|
|
||||||
# sequence,
|
|
||||||
# definition,
|
|
||||||
# tags=tags,
|
|
||||||
# )
|
|
||||||
yield { "id" : ident,
|
yield { "id" : ident,
|
||||||
"definition" : definition,
|
"definition" : definition,
|
||||||
"sequence" : sequence,
|
"sequence" : sequence,
|
||||||
@ -62,7 +76,10 @@ def fastaIterator(lineiterator,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def fastaNucIterator(lineiterator, int buffersize=100000000):
|
def fastaNucIterator(lineiterator,
|
||||||
|
int buffersize=100000000,
|
||||||
|
int skip=0
|
||||||
|
):
|
||||||
cdef LineBuffer lb
|
cdef LineBuffer lb
|
||||||
cdef str ident
|
cdef str ident
|
||||||
cdef str definition
|
cdef str definition
|
||||||
@ -70,6 +87,7 @@ def fastaNucIterator(lineiterator, int buffersize=100000000):
|
|||||||
cdef list s
|
cdef list s
|
||||||
cdef bytes sequence
|
cdef bytes sequence
|
||||||
cdef bytes quality
|
cdef bytes quality
|
||||||
|
cdef int skipped
|
||||||
# cdef OBI_Seq seq
|
# cdef OBI_Seq seq
|
||||||
|
|
||||||
if isinstance(lineiterator,(str,bytes)):
|
if isinstance(lineiterator,(str,bytes)):
|
||||||
@ -80,10 +98,22 @@ def fastaNucIterator(lineiterator, int buffersize=100000000):
|
|||||||
else:
|
else:
|
||||||
lb=LineBuffer(lineiterator,buffersize)
|
lb=LineBuffer(lineiterator,buffersize)
|
||||||
|
|
||||||
|
|
||||||
|
skipped = 0
|
||||||
i = iter(lb)
|
i = iter(lb)
|
||||||
line = next(i)
|
line = next(i)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
|
while skipped < skip :
|
||||||
|
line = next(i)
|
||||||
|
try:
|
||||||
|
while line[0]!='>':
|
||||||
|
line = next(i)
|
||||||
|
except StopIteration:
|
||||||
|
pass
|
||||||
|
skipped += 1
|
||||||
|
|
||||||
ident,tags,definition = parseHeader(line)
|
ident,tags,definition = parseHeader(line)
|
||||||
s = []
|
s = []
|
||||||
line = next(i)
|
line = next(i)
|
||||||
|
@ -6,13 +6,17 @@ Created on 30 mars 2016
|
|||||||
@author: coissac
|
@author: coissac
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def fastqIterator(lineiterator, int buffersize=100000000):
|
def fastqIterator(lineiterator,
|
||||||
|
int buffersize=100000000,
|
||||||
|
int skip=0):
|
||||||
cdef LineBuffer lb
|
cdef LineBuffer lb
|
||||||
cdef str ident
|
cdef str ident
|
||||||
cdef str definition
|
cdef str definition
|
||||||
cdef dict tags
|
cdef dict tags
|
||||||
cdef bytes sequence
|
cdef bytes sequence
|
||||||
cdef bytes quality
|
cdef bytes quality
|
||||||
|
cdef int skipped, lines_to_skip
|
||||||
|
cdef int j
|
||||||
|
|
||||||
if isinstance(lineiterator,(str,bytes)):
|
if isinstance(lineiterator,(str,bytes)):
|
||||||
lineiterator=uopen(lineiterator)
|
lineiterator=uopen(lineiterator)
|
||||||
@ -22,19 +26,28 @@ def fastqIterator(lineiterator, int buffersize=100000000):
|
|||||||
else:
|
else:
|
||||||
lb=LineBuffer(lineiterator,buffersize)
|
lb=LineBuffer(lineiterator,buffersize)
|
||||||
|
|
||||||
|
lines_to_skip = skip*4
|
||||||
|
skipped = 0
|
||||||
i = iter(lb)
|
i = iter(lb)
|
||||||
|
|
||||||
for line in i:
|
for line in i:
|
||||||
ident,tags,definition = parseHeader(line)
|
|
||||||
sequence = str2bytes(next(i)[0:-1])
|
|
||||||
next(i)
|
|
||||||
quality = str2bytes(next(i)[0:-1])
|
|
||||||
|
|
||||||
yield { "id" : ident,
|
if skipped < lines_to_skip :
|
||||||
"definition" : definition,
|
skipped += 1
|
||||||
"sequence" : sequence,
|
pass
|
||||||
"quality" : quality,
|
|
||||||
"tags" : tags,
|
else :
|
||||||
"annotation" : {}
|
ident,tags,definition = parseHeader(line)
|
||||||
}
|
sequence = str2bytes(next(i)[0:-1])
|
||||||
|
next(i)
|
||||||
|
quality = str2bytes(next(i)[0:-1])
|
||||||
|
|
||||||
|
yield { "id" : ident,
|
||||||
|
"definition" : definition,
|
||||||
|
"sequence" : sequence,
|
||||||
|
"quality" : quality,
|
||||||
|
"tags" : tags,
|
||||||
|
"annotation" : {}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user