Cython: fasta and fastq parsers now return bytes and take an NA string argument
This commit is contained in:
@ -7,10 +7,11 @@ from obitools3.utils cimport bytes2str
|
||||
|
||||
cdef class FastaFormat:
|
||||
|
||||
def __init__(self, list tags=[], bint printNAKeys=False):
|
||||
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||
self.headerFormatter = HeaderFormat("fasta",
|
||||
tags,
|
||||
printNAKeys)
|
||||
tags=tags,
|
||||
printNAKeys=printNAKeys,
|
||||
NAString=NAString)
|
||||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, object data):
|
||||
@ -24,5 +25,5 @@ cdef class FastaFormat:
|
||||
|
||||
brawseq = b'\n'.join(lines)
|
||||
|
||||
return bytes2str(self.headerFormatter(data) + b"\n" + brawseq)
|
||||
return self.headerFormatter(data) + b"\n" + brawseq
|
||||
|
||||
|
@ -5,28 +5,25 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
|
||||
from obitools3.utils cimport bytes2str, str2bytes, tobytes
|
||||
|
||||
|
||||
# TODO quality offset option
|
||||
# TODO quality offset option?
|
||||
cdef class FastqFormat:
|
||||
|
||||
def __init__(self, list tags=[], bint printNAKeys=False):
|
||||
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||
self.headerFormatter = HeaderFormat("fastq",
|
||||
tags,
|
||||
printNAKeys)
|
||||
tags=tags,
|
||||
printNAKeys=printNAKeys,
|
||||
NAString=NAString)
|
||||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, object data):
|
||||
|
||||
cdef bytes quality
|
||||
|
||||
if hasattr(data, "quality_str"):
|
||||
quality = str2bytes(data.quality_str) # TODO quality_bytes property
|
||||
if hasattr(data, "quality_bytes"):
|
||||
quality = data.quality_bytes
|
||||
elif hasattr(data, "quality"):
|
||||
quality = tobytes(data.quality)
|
||||
else:
|
||||
raise AttributeError("No quality when exporting to fastq") # TODO discuss
|
||||
|
||||
return bytes2str(self.headerFormatter(data) +
|
||||
b"\n" +
|
||||
data[NUC_SEQUENCE_COLUMN] +
|
||||
b"\n+\n" +
|
||||
quality)
|
||||
return self.headerFormatter(data) + b"\n" + data[NUC_SEQUENCE_COLUMN] + b"\n+\n" + quality
|
||||
|
@ -2,6 +2,7 @@ cdef class HeaderFormat:
|
||||
|
||||
cdef bytes start
|
||||
cdef set tags
|
||||
cdef bint printNaKeys
|
||||
cdef bint printNAKeys
|
||||
cdef bytes NAString
|
||||
cdef size_t headerBufferLength
|
||||
|
@ -7,13 +7,14 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
|
||||
COUNT_COLUMN
|
||||
|
||||
from obitools3.utils cimport str2bytes
|
||||
from obitools3.dms.column.column cimport Column_line
|
||||
|
||||
|
||||
cdef class HeaderFormat:
|
||||
|
||||
SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN]
|
||||
|
||||
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False):
|
||||
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||
'''
|
||||
@param format:
|
||||
@type format: `str`
|
||||
@ -23,10 +24,14 @@ cdef class HeaderFormat:
|
||||
|
||||
@param printNAKeys:
|
||||
@type printNAKeys: `bool`
|
||||
|
||||
@param NAString:
|
||||
@type NAString: `bytes`
|
||||
'''
|
||||
|
||||
self.tags = set(tags)
|
||||
self.printNaKeys = printNAKeys
|
||||
self.printNAKeys = printNAKeys
|
||||
self.NAString = NAString
|
||||
|
||||
if format=="fasta":
|
||||
self.start=b">"
|
||||
@ -43,7 +48,6 @@ cdef class HeaderFormat:
|
||||
cdef list lines = [b""]
|
||||
cdef bytes tagline
|
||||
|
||||
|
||||
if self.tags is not None and self.tags:
|
||||
ktags = self.tags
|
||||
else:
|
||||
@ -52,8 +56,16 @@ cdef class HeaderFormat:
|
||||
for k in ktags:
|
||||
if k in tags:
|
||||
value = data[k]
|
||||
if value is not None or self.printNaKeys:
|
||||
lines.append(k + b"=" + str2bytes(str(data[k]))) #TODO bytes() method on values (str equivalent)
|
||||
if value is None:
|
||||
if self.printNAKeys:
|
||||
value = self.NAString
|
||||
else:
|
||||
if type(value) == Column_line:
|
||||
value = value.bytes()
|
||||
else:
|
||||
value = str2bytes(str(value)) # TODO ugly but how else?
|
||||
if value is not None:
|
||||
lines.append(k + b"=" + value + b";")
|
||||
|
||||
if len(lines) > 1:
|
||||
tagline=b" ".join(lines)
|
||||
|
Reference in New Issue
Block a user