Cython: fasta and fastq header formatter

This commit is contained in:
Celine Mercier
2018-10-09 16:41:00 +02:00
parent aa5ee53478
commit 8029493c10
2 changed files with 36 additions and 27 deletions

View File

@ -1,6 +1,6 @@
cdef class HeaderFormat:
cdef str start
cdef bytes start
cdef set tags
cdef bint printNaKeys
cdef size_t headerBufferLength

View File

@ -1,11 +1,22 @@
#cython: language_level=3
from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
ID_COLUMN, \
DEFINITION_COLUMN, \
QUALITY_COLUMN, \
COUNT_COLUMN
from obitools3.utils cimport str2bytes
cdef class HeaderFormat:
def __init__(self, bint fastaHeader=True, list tags=[], bint printNAKeys=False):
SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN]
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False):
'''
@param fastaHeader:
@type fastaHeader: `bool`
@param format:
@type format: `str`
@param tags:
@type tags: `list` of `bytes`
@ -17,44 +28,42 @@ cdef class HeaderFormat:
self.tags = set(tags)
self.printNaKeys = printNAKeys
if fastaHeader:
self.start=">"
else:
self.start="@"
if format=="fasta":
self.start=b">"
elif format=="fastq":
self.start=b"@"
self.headerBufferLength = 1000
#self.headerBuffer = []
def __call__(self, dict data):
cdef str header
cdef dict tags = data['tags']
def __call__(self, object data):
cdef bytes header
cdef list tags = [key for key in data if key not in self.SPECIAL_KEYS]
cdef set ktags
cdef list lines = [""]
cdef str tagline
cdef list lines = [b""]
cdef bytes tagline
if self.tags is not None and self.tags:
ktags = self.tags
else:
ktags = set(tags.keys())
ktags = set(tags)
for k in ktags:
if k in tags:
value = tags[k]
value = data[k]
if value is not None or self.printNaKeys:
lines.append("%s=%s;" % (k,tags[k]))
lines.append(k + b"=" + str2bytes(str(data[k]))) #TODO bytes() method on values (str equivalent)
if len(lines) > 1:
tagline=" ".join(lines)
tagline=b" ".join(lines)
else:
tagline=""
tagline=b""
if data['definition'] is not None:
header = "%s%s%s %s" % (self.start,data['id'],
tagline,
data['definition'])
if data[DEFINITION_COLUMN] is not None:
header = self.start + data[ID_COLUMN] + tagline + b" " + data[DEFINITION_COLUMN]
else:
header = "%s%s%s" % (self.start,data['id'],
tagline)
header = self.start + data[ID_COLUMN] + tagline
return header