Cython API: Various fixes in input handlers (parsers, openers, etc.).

Mostly working, but not bug-free.
This commit is contained in:
Celine Mercier
2017-08-20 17:37:51 +02:00
parent c559ddf487
commit 74f15d1a23
4 changed files with 75 additions and 67 deletions

View File

@ -6,7 +6,9 @@ Created on 30 mars 2016
@author: coissac @author: coissac
''' '''
from obitools3.dms.obiseq import Nuc_Seq import types
from obitools3.dms.obiseq cimport Nuc_Seq
def fastaIterator(lineiterator, def fastaIterator(lineiterator,
@ -48,7 +50,7 @@ def fastaIterator(lineiterator,
while True: while True:
if read >= ionly: if ionly >= 0 and read >= ionly:
break break
while skipped < skip : while skipped < skip :
@ -79,7 +81,7 @@ def fastaIterator(lineiterator,
# definition, # definition,
# tags=tags, # tags=tags,
# ) # )
# TODO
yield { "id" : ident, yield { "id" : ident,
"definition" : definition, "definition" : definition,
"sequence" : sequence, "sequence" : sequence,
@ -105,64 +107,64 @@ def fastaNucIterator(lineiterator,
cdef list s cdef list s
cdef bytes sequence cdef bytes sequence
cdef int lines_to_skip, ionly, read cdef int lines_to_skip, ionly, read
# cdef OBI_Seq seq cdef Nuc_Seq seq
if only is None: if only is None:
ionly=-1 ionly = -1
else: else:
ionly=int(only) ionly = int(only)
if isinstance(lineiterator,(str,bytes)): if isinstance(lineiterator, (str, bytes)):
lineiterator=uopen(lineiterator) lineiterator=uopen(lineiterator)
if isinstance(lineiterator, types.GeneratorType):
iterator = lineiterator
if isinstance(lineiterator, LineBuffer): if isinstance(lineiterator, LineBuffer):
lb=lineiterator iterator = iter(lineiterator)
else: else:
lb=LineBuffer(lineiterator,buffersize) iterator = iter(LineBuffer(lineiterator, buffersize))
skipped = 0 skipped = 0
read = 0 read = 0
i = iter(lb)
if firstline is None: if firstline is None:
line = next(i) line = next(iterator)
else: else:
line = firstline line = firstline
while True: while True:
if read >= ionly: if ionly >= 0 and read >= ionly:
break break
while skipped < skip : while skipped < skip :
line = next(i) line = next(iterator)
try: try:
while line[0]!='>': while line[0]!='>':
line = next(i) line = next(iterator)
except StopIteration: except StopIteration:
pass pass
skipped += 1 skipped += 1
ident,tags,definition = parseHeader(line) ident,tags,definition = parseHeader(line)
s = [] s = []
line = next(i) line = next(iterator)
try: try:
while line[0]!='>': while line[0]!='>':
s.append(str2bytes(line)[0:-1]) s.append(str2bytes(line)[0:-1])
line = next(i) line = next(iterator)
except StopIteration: except StopIteration:
pass pass
sequence = b"".join(s) sequence = b"".join(s)
# seq =
seq = Nuc_Seq(ident, seq = Nuc_Seq(ident,
sequence, sequence,
definition, definition=definition,
None,-1, quality=None,
tags) offset=-1,
tags=tags)
yield seq yield seq

View File

@ -6,7 +6,7 @@ Created on 30 mars 2016
@author: coissac @author: coissac
''' '''
from obitools3.dms.obiseq import Nuc_Seq from obitools3.dms.obiseq cimport Nuc_Seq
def fastqIterator(lineiterator, def fastqIterator(lineiterator,
@ -74,10 +74,9 @@ def fastqWithQualityIterator(lineiterator,
else: else:
hline = firstline hline = firstline
for line in i: for line in i:
if read >= ionly: if ionly >= 0 and read >= ionly:
break break
ident,tags,definition = parseHeader(hline) ident,tags,definition = parseHeader(hline)
@ -87,9 +86,10 @@ def fastqWithQualityIterator(lineiterator,
seq = Nuc_Seq(ident, seq = Nuc_Seq(ident,
sequence, sequence,
definition, definition=definition,
quality,qualityoffset, quality=quality,
tags) offset=qualityoffset,
tags=tags)
yield seq yield seq
@ -152,7 +152,7 @@ def fastqWithoutQualityIterator(lineiterator,
for line in i: for line in i:
if read >= ionly: if ionly >= 0 and read >= ionly:
break break
ident,tags,definition = parseHeader(hline) ident,tags,definition = parseHeader(hline)
@ -162,9 +162,10 @@ def fastqWithoutQualityIterator(lineiterator,
seq = Nuc_Seq(ident, seq = Nuc_Seq(ident,
sequence, sequence,
definition, definition=definition,
None,-1, quality=None,
tags) offset=-1,
tags=tags)
yield seq yield seq

View File

@ -41,7 +41,7 @@ def entryIteratorFactory(lineiterator,
if isinstance(lineiterator, LineBuffer): if isinstance(lineiterator, LineBuffer):
lb=lineiterator lb=lineiterator
else: else:
lb=LineBuffer(lineiterator,buffersize) lb=LineBuffer(lineiterator, buffersize)
i = iter(lb) i = iter(lb)
@ -62,25 +62,28 @@ def entryIteratorFactory(lineiterator,
elif is_ngsfilter_line(first): elif is_ngsfilter_line(first):
format=b"ngsfilter" format=b"ngsfilter"
# TODO Temporary fix
first=None
lineiterator.seek(0)
if format==b'fasta': if format==b'fasta':
if seqtype == b'nuc': if seqtype == b'nuc':
return (fastaNucIterator(lineiterator, return (fastaNucIterator(lineiterator,
skip,only, skip=skip,only=only,
first), firstline=first,
buffersize=buffersize),
Nuc_Seq) Nuc_Seq)
else: else:
raise NotImplementedError() raise NotImplementedError()
elif format==b'fastq': elif format==b'fastq':
return (fastqIterator(lineiterator, return (fastqIterator(lineiterator,
skip,only, skip=skip,only=only,
qualityoffset, qualityoffset=qualityoffset,
first), noquality=noquality,
firstline=first,
buffersize=buffersize),
Nuc_Seq) Nuc_Seq)
raise NotImplementedError('File format not yet implemented') raise NotImplementedError('File format not yet implemented')

View File

@ -364,19 +364,22 @@ def open_uri(uri,
if qualifiers[b"seqtype"]==b"nuc": if qualifiers[b"seqtype"]==b"nuc":
objclass = Nuc_Seq objclass = Nuc_Seq
if format==b"fasta": if format==b"fasta":
iseq = fastaNucIterator(file,skip,only) iseq = fastaNucIterator(file,
skip=skip,
only=only)
elif format==b"fastq": elif format==b"fastq":
iseq = fastqIterator(file, iseq = fastqIterator(file,
skip,only, skip=skip,
offset, only=only,
noquality) offset=offset,
noquality=noquality)
else: else:
raise NotImplementedError('Sequence file format not implemented') raise NotImplementedError('Sequence file format not implemented')
elif qualifiers[b"seqtype"]==b"prot": elif qualifiers[b"seqtype"]==b"prot":
raise NotImplementedError() raise NotImplementedError()
else: else:
iseq,objclass = entryIteratorFactory(file, iseq,objclass = entryIteratorFactory(file,
skip,only, skip, only,
seqtype, seqtype,
offset, offset,
noquality, noquality,
@ -389,10 +392,9 @@ def open_uri(uri,
blanklineskip, blanklineskip,
commentchar) commentchar)
#tmpdms = get_temp_dms()
tmpdms = get_temp_dms() return (file, iseq, objclass, urib)
return (file,iseq,objclass,urib)