Cython: fasta and fastq parsers now return bytes and take NA string argument
This commit is contained in:
Celine Mercier
2018-10-17 11:16:20 +02:00
parent 61b00d6013
commit e6bbe13d81
4 changed files with 34 additions and 23 deletions

View File

@ -7,10 +7,11 @@ from obitools3.utils cimport bytes2str
cdef class FastaFormat: cdef class FastaFormat:
def __init__(self, list tags=[], bint printNAKeys=False): def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
self.headerFormatter = HeaderFormat("fasta", self.headerFormatter = HeaderFormat("fasta",
tags, tags=tags,
printNAKeys) printNAKeys=printNAKeys,
NAString=NAString)
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):
@ -24,5 +25,5 @@ cdef class FastaFormat:
brawseq = b'\n'.join(lines) brawseq = b'\n'.join(lines)
return bytes2str(self.headerFormatter(data) + b"\n" + brawseq) return self.headerFormatter(data) + b"\n" + brawseq

View File

@ -5,28 +5,25 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
from obitools3.utils cimport bytes2str, str2bytes, tobytes from obitools3.utils cimport bytes2str, str2bytes, tobytes
# TODO quality offset option # TODO quality offset option?
cdef class FastqFormat: cdef class FastqFormat:
def __init__(self, list tags=[], bint printNAKeys=False): def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
self.headerFormatter = HeaderFormat("fastq", self.headerFormatter = HeaderFormat("fastq",
tags, tags=tags,
printNAKeys) printNAKeys=printNAKeys,
NAString=NAString)
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):
cdef bytes quality cdef bytes quality
if hasattr(data, "quality_str"): if hasattr(data, "quality_bytes"):
quality = str2bytes(data.quality_str) # TODO quality_bytes property quality = data.quality_bytes
elif hasattr(data, "quality"): elif hasattr(data, "quality"):
quality = tobytes(data.quality) quality = tobytes(data.quality)
else: else:
raise AttributeError("No quality when exporting to fastq") # TODO discuss raise AttributeError("No quality when exporting to fastq") # TODO discuss
return bytes2str(self.headerFormatter(data) + return self.headerFormatter(data) + b"\n" + data[NUC_SEQUENCE_COLUMN] + b"\n+\n" + quality
b"\n" +
data[NUC_SEQUENCE_COLUMN] +
b"\n+\n" +
quality)

View File

@ -2,6 +2,7 @@ cdef class HeaderFormat:
cdef bytes start cdef bytes start
cdef set tags cdef set tags
cdef bint printNaKeys cdef bint printNAKeys
cdef bytes NAString
cdef size_t headerBufferLength cdef size_t headerBufferLength

View File

@ -7,13 +7,14 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
COUNT_COLUMN COUNT_COLUMN
from obitools3.utils cimport str2bytes from obitools3.utils cimport str2bytes
from obitools3.dms.column.column cimport Column_line
cdef class HeaderFormat: cdef class HeaderFormat:
SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN] SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN]
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False): def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
''' '''
@param format: @param format:
@type format: `str` @type format: `str`
@ -23,10 +24,14 @@ cdef class HeaderFormat:
@param printNAKeys: @param printNAKeys:
@type printNAKeys: `bool` @type printNAKeys: `bool`
@param NAString:
@type NAString: `bytes`
''' '''
self.tags = set(tags) self.tags = set(tags)
self.printNaKeys = printNAKeys self.printNAKeys = printNAKeys
self.NAString = NAString
if format=="fasta": if format=="fasta":
self.start=b">" self.start=b">"
@ -43,7 +48,6 @@ cdef class HeaderFormat:
cdef list lines = [b""] cdef list lines = [b""]
cdef bytes tagline cdef bytes tagline
if self.tags is not None and self.tags: if self.tags is not None and self.tags:
ktags = self.tags ktags = self.tags
else: else:
@ -52,9 +56,17 @@ cdef class HeaderFormat:
for k in ktags: for k in ktags:
if k in tags: if k in tags:
value = data[k] value = data[k]
if value is not None or self.printNaKeys: if value is None:
lines.append(k + b"=" + str2bytes(str(data[k]))) #TODO bytes() method on values (str equivalent) if self.printNAKeys:
value = self.NAString
else:
if type(value) == Column_line:
value = value.bytes()
else:
value = str2bytes(str(value)) # TODO ugly but how else?
if value is not None:
lines.append(k + b"=" + value + b";")
if len(lines) > 1: if len(lines) > 1:
tagline=b" ".join(lines) tagline=b" ".join(lines)
else: else: