Cython: fasta and fastq parsers now return bytes and take NA string argument
This commit is contained in:
@ -7,10 +7,11 @@ from obitools3.utils cimport bytes2str
|
|||||||
|
|
||||||
cdef class FastaFormat:
|
cdef class FastaFormat:
|
||||||
|
|
||||||
def __init__(self, list tags=[], bint printNAKeys=False):
|
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||||
self.headerFormatter = HeaderFormat("fasta",
|
self.headerFormatter = HeaderFormat("fasta",
|
||||||
tags,
|
tags=tags,
|
||||||
printNAKeys)
|
printNAKeys=printNAKeys,
|
||||||
|
NAString=NAString)
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
def __call__(self, object data):
|
def __call__(self, object data):
|
||||||
@ -24,5 +25,5 @@ cdef class FastaFormat:
|
|||||||
|
|
||||||
brawseq = b'\n'.join(lines)
|
brawseq = b'\n'.join(lines)
|
||||||
|
|
||||||
return bytes2str(self.headerFormatter(data) + b"\n" + brawseq)
|
return self.headerFormatter(data) + b"\n" + brawseq
|
||||||
|
|
||||||
|
@ -5,28 +5,25 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
|
|||||||
from obitools3.utils cimport bytes2str, str2bytes, tobytes
|
from obitools3.utils cimport bytes2str, str2bytes, tobytes
|
||||||
|
|
||||||
|
|
||||||
# TODO quality offset option
|
# TODO quality offset option?
|
||||||
cdef class FastqFormat:
|
cdef class FastqFormat:
|
||||||
|
|
||||||
def __init__(self, list tags=[], bint printNAKeys=False):
|
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||||
self.headerFormatter = HeaderFormat("fastq",
|
self.headerFormatter = HeaderFormat("fastq",
|
||||||
tags,
|
tags=tags,
|
||||||
printNAKeys)
|
printNAKeys=printNAKeys,
|
||||||
|
NAString=NAString)
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
def __call__(self, object data):
|
def __call__(self, object data):
|
||||||
|
|
||||||
cdef bytes quality
|
cdef bytes quality
|
||||||
|
|
||||||
if hasattr(data, "quality_str"):
|
if hasattr(data, "quality_bytes"):
|
||||||
quality = str2bytes(data.quality_str) # TODO quality_bytes property
|
quality = data.quality_bytes
|
||||||
elif hasattr(data, "quality"):
|
elif hasattr(data, "quality"):
|
||||||
quality = tobytes(data.quality)
|
quality = tobytes(data.quality)
|
||||||
else:
|
else:
|
||||||
raise AttributeError("No quality when exporting to fastq") # TODO discuss
|
raise AttributeError("No quality when exporting to fastq") # TODO discuss
|
||||||
|
|
||||||
return bytes2str(self.headerFormatter(data) +
|
return self.headerFormatter(data) + b"\n" + data[NUC_SEQUENCE_COLUMN] + b"\n+\n" + quality
|
||||||
b"\n" +
|
|
||||||
data[NUC_SEQUENCE_COLUMN] +
|
|
||||||
b"\n+\n" +
|
|
||||||
quality)
|
|
||||||
|
@ -2,6 +2,7 @@ cdef class HeaderFormat:
|
|||||||
|
|
||||||
cdef bytes start
|
cdef bytes start
|
||||||
cdef set tags
|
cdef set tags
|
||||||
cdef bint printNaKeys
|
cdef bint printNAKeys
|
||||||
|
cdef bytes NAString
|
||||||
cdef size_t headerBufferLength
|
cdef size_t headerBufferLength
|
||||||
|
|
@ -7,13 +7,14 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
|
|||||||
COUNT_COLUMN
|
COUNT_COLUMN
|
||||||
|
|
||||||
from obitools3.utils cimport str2bytes
|
from obitools3.utils cimport str2bytes
|
||||||
|
from obitools3.dms.column.column cimport Column_line
|
||||||
|
|
||||||
|
|
||||||
cdef class HeaderFormat:
|
cdef class HeaderFormat:
|
||||||
|
|
||||||
SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN]
|
SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN]
|
||||||
|
|
||||||
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False):
|
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||||
'''
|
'''
|
||||||
@param format:
|
@param format:
|
||||||
@type format: `str`
|
@type format: `str`
|
||||||
@ -23,10 +24,14 @@ cdef class HeaderFormat:
|
|||||||
|
|
||||||
@param printNAKeys:
|
@param printNAKeys:
|
||||||
@type printNAKeys: `bool`
|
@type printNAKeys: `bool`
|
||||||
|
|
||||||
|
@param NAString:
|
||||||
|
@type NAString: `bytes`
|
||||||
'''
|
'''
|
||||||
|
|
||||||
self.tags = set(tags)
|
self.tags = set(tags)
|
||||||
self.printNaKeys = printNAKeys
|
self.printNAKeys = printNAKeys
|
||||||
|
self.NAString = NAString
|
||||||
|
|
||||||
if format=="fasta":
|
if format=="fasta":
|
||||||
self.start=b">"
|
self.start=b">"
|
||||||
@ -43,7 +48,6 @@ cdef class HeaderFormat:
|
|||||||
cdef list lines = [b""]
|
cdef list lines = [b""]
|
||||||
cdef bytes tagline
|
cdef bytes tagline
|
||||||
|
|
||||||
|
|
||||||
if self.tags is not None and self.tags:
|
if self.tags is not None and self.tags:
|
||||||
ktags = self.tags
|
ktags = self.tags
|
||||||
else:
|
else:
|
||||||
@ -52,8 +56,16 @@ cdef class HeaderFormat:
|
|||||||
for k in ktags:
|
for k in ktags:
|
||||||
if k in tags:
|
if k in tags:
|
||||||
value = data[k]
|
value = data[k]
|
||||||
if value is not None or self.printNaKeys:
|
if value is None:
|
||||||
lines.append(k + b"=" + str2bytes(str(data[k]))) #TODO bytes() method on values (str equivalent)
|
if self.printNAKeys:
|
||||||
|
value = self.NAString
|
||||||
|
else:
|
||||||
|
if type(value) == Column_line:
|
||||||
|
value = value.bytes()
|
||||||
|
else:
|
||||||
|
value = str2bytes(str(value)) # TODO ugly but how else?
|
||||||
|
if value is not None:
|
||||||
|
lines.append(k + b"=" + value + b";")
|
||||||
|
|
||||||
if len(lines) > 1:
|
if len(lines) > 1:
|
||||||
tagline=b" ".join(lines)
|
tagline=b" ".join(lines)
|
||||||
|
Reference in New Issue
Block a user