Cython API: added slices in Seq classes and fixes

This commit is contained in:
Celine Mercier
2018-03-12 17:51:41 +01:00
parent e6c49b7941
commit 96bf2daae8
2 changed files with 92 additions and 37 deletions

View File

@ -8,19 +8,22 @@ cdef class Seq(dict) :
cpdef object clone(self)
cpdef str get_str(self)
cpdef get_symbol_at(self, int pos)
cpdef get_slice(self, slice slice_to_get)
# Declaration of Nuc_Seq, a Seq subclass: C-level attributes and cpdef method signatures.
cdef class Nuc_Seq(Seq) :
# Nuc_Seq-typed attribute; the implementation stores the reverse-complemented sequence here once built.
cdef Nuc_Seq _reverse_complement
# Object attribute; the quality_array property fills it lazily from build_quality_array().
cdef object _quality_array
# Boolean flag exposed through the revcomp property (sequence created by reverse complementing another one).
cdef bint _revcomp
# Sets the quality; the default value of offset is defined in the implementation (=*).
cpdef set_quality(self, object new_quality, int offset=*)
# Takes a list of quality values and returns an object.
cpdef object build_quality_array(self, list quality)
# Returns bytes.
cpdef bytes build_reverse_complement(self)
# Declaration of Seq_Stored, a Line subclass.
# The `pass` placeholder is dropped: the class now declares real methods.
cdef class Seq_Stored(Line) :
    # Returns the symbol found at position pos.
    cpdef get_symbol_at(self, int pos)
    # Returns the part of the sequence selected by slice_to_get.
    cpdef get_slice(self, slice slice_to_get)
cdef class Nuc_Seq_Stored(Seq_Stored) :
@ -29,7 +32,6 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
cdef object _quality_array
cdef bytes _seq
cpdef set(self, object id, object seq, object definition=*, object quality=*, int offset=*, object tags=*)
cpdef get_symbol_at(self, int pos)
cpdef set_quality_int(self, list new_qual)
cpdef set_quality_char(self, object new_qual, int offset=*)
cpdef object build_quality_array(self, list quality)

View File

@ -4,7 +4,9 @@ from obitools3.utils cimport bytes2str, str2bytes, tostr, tobytes
from obitools3.dms.view.view cimport View
from .capi.obitypes cimport index_t
from obitools3.dms.column.column cimport Column
from .capi.obitypes cimport index_t, OBI_QUAL
from .capi.obiview cimport NUC_SEQUENCE_COLUMN, \
ID_COLUMN, \
@ -30,10 +32,9 @@ cdef class Seq(dict) :
def __init__(self, object id, object seq, object definition=None, object tags=None) :
cdef object k
cdef bytes k_b
self[ID_COLUMN] = tobytes(id)
self[SEQUENCE_COLUMN] = tobytes(seq)
if definition is not None :
self.definition = tobytes(definition)
self.id = id
self.seq = seq
self.definition = definition
if tags is not None :
for k in tags:
k_b = tobytes(k)
@ -56,6 +57,8 @@ cdef class Seq(dict) :
def __getitem__(self, object ref):
    """Dispatch item access on the exact type of ``ref``.

    An ``int`` is answered by ``get_symbol_at``, a ``slice`` by
    ``get_slice``; any other reference is converted with ``tobytes`` and
    looked up through the parent class ``__getitem__``.
    """
    # Exact type comparison (not isinstance): subclasses of int or slice
    # fall through to the key lookup, as before.
    ref_kind = type(ref)
    if ref_kind == int:
        return self.get_symbol_at(ref)
    if ref_kind == slice:
        return self.get_slice(ref)
    return super(Seq, self).__getitem__(tobytes(ref))
@ -77,6 +80,18 @@ cdef class Seq(dict) :
seq = seq_class(self.id, self.seq, definition=self.definition, quality=self.quality, tags=self)
return seq
cpdef object get_slice(self, slice slice_to_get):
    """Build a new sequence object from a slice of this one.

    The new object is created with the class of ``self``. Its id is the
    current id followed by ``b"_SUB"``, its sequence is
    ``self.seq[slice_to_get]``, and the definition and tags are taken from
    ``self``. The quality is sliced the same way when ``QUALITY_COLUMN`` is
    present, otherwise ``None`` is passed.
    """
    cdef list sliced_quality = None
    cdef type seq_class = type(self)
    if QUALITY_COLUMN in self:
        sliced_quality = self.quality[slice_to_get]
    # NOTE(review): the Seq.__init__ signature visible in this file takes no
    # `quality` argument, so this call presumably only works for subclasses
    # whose constructor accepts it -- confirm.
    return seq_class(self.id+b"_SUB",   # TODO discuss suffix
                     self.seq[slice_to_get],
                     definition=self.definition,
                     quality=sliced_quality,
                     tags=self)
cpdef get_symbol_at(self, int pos):
    """Return the symbol at position ``pos`` as a length-1 slice of ``self.seq``.

    A slice is used instead of plain indexing so that the result has the
    same type as the sequence. A position outside the sequence yields an
    empty slice rather than an error.
    """
    # pos == -1 needs its own branch: seq[-1:0] is always empty, so the
    # generic pos:pos+1 form would never return the last symbol.
    if pos == -1:
        return self.seq[-1:]
    return self.seq[pos:pos+1]
@ -105,7 +120,9 @@ cdef class Seq(dict) :
@definition.setter
def definition(self, object new_definition): # @DuplicatedSignature
    """Store the definition under ``DEFINITION_COLUMN``.

    A value that is not ``None`` is converted with ``tobytes`` first;
    ``None`` is stored unchanged.
    """
    # The unconditional `tobytes(new_definition)` assignment that preceded
    # this guard is removed: it ran before the None check and was
    # immediately overwritten anyway.
    if new_definition is not None:
        new_definition = tobytes(new_definition)
    self[DEFINITION_COLUMN] = new_definition
cdef class Nuc_Seq(Seq) :
@ -116,13 +133,14 @@ cdef class Nuc_Seq(Seq) :
cdef int q
cdef list q_proba_list
self[ID_COLUMN] = tobytes(id)
self[NUC_SEQUENCE_COLUMN] = tobytes(seq)
if definition is not None :
self.definition = tobytes(definition)
self.id = id
self.seq = seq
self.definition = definition
if quality is not None:
self.set_quality(quality, offset=offset)
self._revcomp = False
if tags is not None:
for k in tags:
k_b = tobytes(k)
@ -139,6 +157,15 @@ cdef class Nuc_Seq(Seq) :
new_seq = Nuc_Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
return new_seq
# revcomp property getter and setter (boolean indicating whether the sequence was created by reverse complementing another sequence)
@property
def revcomp(self):
    """Return the ``_revcomp`` flag of this sequence."""
    return self._revcomp

@revcomp.setter
def revcomp(self, bint revcomp): # @DuplicatedSignature
    """Set the ``_revcomp`` flag to the given boolean."""
    self._revcomp = revcomp
# nuc sequence property getter and setter
@property
def seq(self):
@ -151,7 +178,10 @@ cdef class Nuc_Seq(Seq) :
# sequence quality property getter and setter
@property
def quality(self):
    """Return the value stored under ``QUALITY_COLUMN``, or ``None`` when
    the sequence has no such entry."""
    # The unconditional `return self[QUALITY_COLUMN]` that preceded this
    # check is removed: it made the guard unreachable.
    if QUALITY_COLUMN in self:
        return self[QUALITY_COLUMN]
    return None
cpdef set_quality(self, object new_quality, int offset=-1):
cdef list quality_int
@ -169,7 +199,8 @@ cdef class Nuc_Seq(Seq) :
@property
def quality_array(self):
    """Return the cached quality array, building it on first access.

    The array is built with ``build_quality_array`` from the value stored
    under ``QUALITY_COLUMN``. When the sequence has no such entry nothing
    is built and the cached value (``None`` if never set) is returned.
    """
    # Only the guarded build is kept: the unguarded one ran before the
    # membership check and duplicated the work.
    if self._quality_array is None and QUALITY_COLUMN in self:
        self._quality_array = self.build_quality_array(self[QUALITY_COLUMN])
    return self._quality_array
cpdef object build_quality_array(self, list quality):
@ -190,7 +221,8 @@ cdef class Nuc_Seq(Seq) :
reversed_quality = self.quality[::-1]
else:
reversed_quality = None
seq = Nuc_Seq(self.id, rev_comp, definition=self.definition, quality=reversed_quality, tags=self)
seq = Nuc_Seq(self.id+b"_CMP", rev_comp, definition=self.definition, quality=reversed_quality, tags=self)
seq.revcomp = True
self._reverse_complement = seq
return self._reverse_complement
@ -202,6 +234,29 @@ cdef class Nuc_Seq(Seq) :
cdef class Seq_Stored(Line) :
def __getitem__(self, object ref):
    """Dispatch item access on the exact type of ``ref``.

    An ``int`` is answered by ``get_symbol_at``, a ``slice`` by
    ``get_slice``; any other reference is handed unchanged to the parent
    class ``__getitem__``.
    """
    # Exact type comparison (not isinstance), as before.
    ref_kind = type(ref)
    if ref_kind == int:
        return self.get_symbol_at(ref)
    if ref_kind == slice:
        return self.get_slice(ref)
    return super(Seq_Stored, self).__getitem__(ref)
cpdef object get_slice(self, slice slice_to_get):
    """Build a new sequence object from a slice of this stored sequence.

    The new object is created with the class of ``self``. Its id is the
    current id followed by ``b"_SUB"``, its sequence is
    ``self.seq[slice_to_get]``, and the definition and tags are taken from
    ``self``. The quality is sliced the same way when ``QUALITY_COLUMN`` is
    present, otherwise ``None`` is passed.
    """
    cdef list sliced_quality = None
    cdef type seq_class = type(self)
    if QUALITY_COLUMN in self:
        sliced_quality = self.quality[slice_to_get]
    # NOTE(review): type(self) is a Line subclass here; confirm that its
    # constructor accepts (id, seq, definition=, quality=, tags=) -- the
    # reverse_complement property of Nuc_Seq_Stored builds a Nuc_Seq instead.
    return seq_class(self.id+b"_SUB",   # TODO discuss suffix
                     self.seq[slice_to_get],
                     definition=self.definition,
                     quality=sliced_quality,
                     tags=self)
cpdef get_symbol_at(self, int pos):
    """Return the symbol at position ``pos`` as a length-1 slice of ``self.seq``.

    A slice is used instead of plain indexing so that the result has the
    same type as the sequence. A position outside the sequence yields an
    empty slice rather than an error.
    """
    # pos == -1 needs its own branch: seq[-1:0] is always empty, so the
    # generic pos:pos+1 form would never return the last symbol.
    if pos == -1:
        return self.seq[-1:]
    return self.seq[pos:pos+1]
# sequence id property getter and setter
@property
def id(self): # @ReservedAssignment @DuplicatedSignature
@ -243,7 +298,7 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
self[ID_COLUMN] = tobytes(id)
self[NUC_SEQUENCE_COLUMN] = tobytes(seq)
if definition is not None :
if definition is not None:
self.definition = tobytes(definition)
if quality is not None:
if type(quality) == list:
@ -257,21 +312,11 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
for k in tags:
k_b = tobytes(k)
if k_b not in SPECIAL_COLUMNS:
# TODO discuss convert value to bytes if str
if type(tags[k]) == str:
self[k_b] = str2bytes(tags[k])
else:
self[k_b] = tags[k]
def __getitem__(self, object ref):
    """Return the symbol at ``ref`` when it is exactly an ``int``; hand any
    other reference unchanged to the parent class ``__getitem__``."""
    # Exact type comparison (not isinstance), as before.
    if type(ref) != int:
        return super(Nuc_Seq_Stored, self).__getitem__(ref)
    return self.get_symbol_at(ref)
cpdef get_symbol_at(self, int pos):
    """Return the symbol at position ``pos`` as a length-1 slice of ``self.seq``.

    A slice is used instead of plain indexing so that the result has the
    same type as the sequence. A position outside the sequence yields an
    empty slice rather than an error.
    """
    # pos == -1 needs its own branch: seq[-1:0] is always empty, so the
    # generic pos:pos+1 form would never return the last symbol.
    if pos == -1:
        return self.seq[-1:]
    return self.seq[pos:pos+1]
# seq property getter and setter
@property
def seq(self):
@ -287,15 +332,22 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
self._view.get_column(NUC_SEQUENCE_COLUMN).set_line(self.index, tobytes(new_seq))
cpdef set_quality_int(self, list new_qual):
    """Write ``new_qual`` (a list) on this sequence's line of the quality column.

    When ``QUALITY_COLUMN`` is not found in ``self``, the column is first
    created in the view with ``Column.new_column`` and the ``OBI_QUAL`` type.
    """
    if not (QUALITY_COLUMN in self):
        Column.new_column(self._view, QUALITY_COLUMN, OBI_QUAL)
    quality_column = self._view.get_column(QUALITY_COLUMN)
    quality_column.set_line(self.index, new_qual)
cpdef set_quality_char(self, object new_qual, int offset=-1):
    """Write a character-encoded quality on this sequence's line of the quality column.

    ``new_qual`` is converted with ``tobytes`` and passed, together with
    ``offset``, to the column's ``set_str_line``. When ``QUALITY_COLUMN`` is
    not found in ``self``, the column is first created in the view with
    ``Column.new_column`` and the ``OBI_QUAL`` type.
    """
    if not (QUALITY_COLUMN in self):
        Column.new_column(self._view, QUALITY_COLUMN, OBI_QUAL)
    quality_column = self._view.get_column(QUALITY_COLUMN)
    quality_column.set_str_line(self.index, tobytes(new_qual), offset=offset)
# quality property getter and setter
@property
def quality(self):
    """Return this sequence's line of the quality column, or ``None`` when
    ``QUALITY_COLUMN`` is not found in ``self``."""
    # The unconditional column read that preceded this check is removed:
    # it made the guard unreachable.
    if QUALITY_COLUMN in self:
        return self._view.get_column(QUALITY_COLUMN).get_line(self.index)
    return None
@quality.setter
def quality(self, object new_qual): # @DuplicatedSignature
@ -311,13 +363,17 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
# WARNING: default offset used
@property
def quality_str(self):
    """Return the quality as given by the column's ``get_str_line``, or
    ``None`` when ``QUALITY_COLUMN`` is not found in ``self``."""
    # The unconditional column read that preceded this check is removed:
    # it made the guard unreachable.
    # NOTE(review): this accessor reads self._index while the sibling
    # accessors use self.index -- confirm both give the same line index.
    if QUALITY_COLUMN in self:
        return self._view.get_column(QUALITY_COLUMN).get_str_line(self._index)
    return None
# sequence quality array property getter and setter
@property
def quality_array(self):
    """Return the cached quality array, building it on first access.

    The array is built with ``build_quality_array`` from this sequence's
    line of the quality column. When ``QUALITY_COLUMN`` is not found in
    ``self`` nothing is built and the cached value (``None`` if never set)
    is returned.
    """
    # Only the guarded build is kept: the unguarded one read the column
    # before the membership check and duplicated the work.
    if self._quality_array is None and QUALITY_COLUMN in self:
        self._quality_array = self.build_quality_array(
            self._view.get_column(QUALITY_COLUMN).get_line(self.index))
    return self._quality_array
cpdef object build_quality_array(self, list quality):
@ -331,14 +387,15 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
@property
def reverse_complement(self):
    """Return the reverse complement of this sequence as a ``Nuc_Seq``.

    The result is built on first access and cached in
    ``self._reverse_complement``. Its id is the current id followed by
    ``b"_CMP"``, its sequence comes from ``build_reverse_complement()``,
    its quality is the reversed quality (``None`` when ``QUALITY_COLUMN``
    is not found in ``self``), and its ``revcomp`` flag is set to ``True``.
    """
    cdef bytes rev_comp
    cdef list reversed_quality
    if self._reverse_complement is None:
        rev_comp = self.build_reverse_complement()
        if QUALITY_COLUMN in self:
            reversed_quality = self.quality[::-1]
        else:
            reversed_quality = None
        # Single construction, with the suffixed id. The earlier unsuffixed
        # construction was overwritten immediately, and the unused
        # `seq_class` local is dropped.
        seq = Nuc_Seq(self.id+b"_CMP", rev_comp, definition=self.definition, quality=reversed_quality, tags=self)
        seq.revcomp = True
        self._reverse_complement = seq
    return self._reverse_complement
@ -356,7 +413,3 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
def __len__(self):
    """Return the length of this sequence's line in the nucleotide sequence column."""
    sequence_column = self._view.get_column(NUC_SEQUENCE_COLUMN)
    stored_sequence = sequence_column.get_line(self.index)
    return len(stored_sequence)