From 96bf2daae8e975ba64d9fdec1ba01d63ef547a8f Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 12 Mar 2018 17:51:41 +0100 Subject: [PATCH] Cython API: added slices in Seq classes and fixes --- python/obitools3/dms/obiseq.pxd | 8 ++- python/obitools3/dms/obiseq.pyx | 121 +++++++++++++++++++++++--------- 2 files changed, 92 insertions(+), 37 deletions(-) diff --git a/python/obitools3/dms/obiseq.pxd b/python/obitools3/dms/obiseq.pxd index 3c6a4de..8c04f1c 100644 --- a/python/obitools3/dms/obiseq.pxd +++ b/python/obitools3/dms/obiseq.pxd @@ -8,19 +8,22 @@ cdef class Seq(dict) : cpdef object clone(self) cpdef str get_str(self) cpdef get_symbol_at(self, int pos) - + cpdef get_slice(self, slice slice_to_get) + cdef class Nuc_Seq(Seq) : cdef Nuc_Seq _reverse_complement cdef object _quality_array + cdef bint _revcomp cpdef set_quality(self, object new_quality, int offset=*) cpdef object build_quality_array(self, list quality) cpdef bytes build_reverse_complement(self) cdef class Seq_Stored(Line) : - pass + cpdef get_symbol_at(self, int pos) + cpdef get_slice(self, slice slice_to_get) cdef class Nuc_Seq_Stored(Seq_Stored) : @@ -29,7 +32,6 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : cdef object _quality_array cdef bytes _seq cpdef set(self, object id, object seq, object definition=*, object quality=*, int offset=*, object tags=*) - cpdef get_symbol_at(self, int pos) cpdef set_quality_int(self, list new_qual) cpdef set_quality_char(self, object new_qual, int offset=*) cpdef object build_quality_array(self, list quality) diff --git a/python/obitools3/dms/obiseq.pyx b/python/obitools3/dms/obiseq.pyx index e7e7d92..beed690 100644 --- a/python/obitools3/dms/obiseq.pyx +++ b/python/obitools3/dms/obiseq.pyx @@ -4,7 +4,9 @@ from obitools3.utils cimport bytes2str, str2bytes, tostr, tobytes from obitools3.dms.view.view cimport View -from .capi.obitypes cimport index_t +from obitools3.dms.column.column cimport Column + +from .capi.obitypes cimport index_t, OBI_QUAL from .capi.obiview cimport NUC_SEQUENCE_COLUMN, \ ID_COLUMN, \ @@ -30,10 +32,9 @@ cdef class Seq(dict) : def __init__(self, object id, object seq, object definition=None, object tags=None) : cdef object k cdef bytes k_b - self[ID_COLUMN] = tobytes(id) - self[SEQUENCE_COLUMN] = tobytes(seq) - if definition is not None : - self.definition = tobytes(definition) + self.id = id + self.seq = seq + self.definition = definition if tags is not None : for k in tags: k_b = tobytes(k) @@ -56,6 +57,8 @@ cdef class Seq(dict) : def __getitem__(self, object ref): if type(ref) == int: return self.get_symbol_at(ref) + elif type(ref) == slice: + return self.get_slice(ref) else: return super(Seq, self).__getitem__(tobytes(ref)) @@ -77,6 +80,18 @@ cdef class Seq(dict) : seq = seq_class(self.id, self.seq, definition=self.definition, quality=self.quality, tags=self) return seq + cpdef object get_slice(self, slice slice_to_get): + cdef object new_seq + cdef list sliced_quality + cdef type seq_class + seq_class = type(self) + if QUALITY_COLUMN in self: + sliced_quality = self.quality[slice_to_get] + else: + sliced_quality = None + new_seq = seq_class(self.id+b"_SUB", self.seq[slice_to_get], definition=self.definition, quality=sliced_quality, tags=self) # TODO discuss suffix + return new_seq + cpdef get_symbol_at(self, int pos): return self.seq[pos:pos+1] @@ -105,7 +120,9 @@ cdef class Seq(dict) : @definition.setter def definition(self, object new_definition): # @DuplicatedSignature - self[DEFINITION_COLUMN] = tobytes(new_definition) + if new_definition is not None: + new_definition = tobytes(new_definition) + self[DEFINITION_COLUMN] = new_definition cdef class Nuc_Seq(Seq) : @@ -116,13 +133,14 @@ cdef class Nuc_Seq(Seq) : cdef int q cdef list q_proba_list - self[ID_COLUMN] = tobytes(id) - self[NUC_SEQUENCE_COLUMN] = tobytes(seq) - if definition is not None : - self.definition = tobytes(definition) + self.id = id + self.seq = seq + self.definition = definition if quality is not None: self.set_quality(quality, offset=offset) - + + self._revcomp = False + if tags is not None: for k in tags: k_b = tobytes(k) @@ -139,6 +157,15 @@ cdef class Nuc_Seq(Seq) : new_seq = Nuc_Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone) return new_seq + # revcomp property getter and setter (boolean indicating whether the sequence was created by reverse complementing another sequence) + @property + def revcomp(self): + return self._revcomp + + @revcomp.setter + def revcomp(self, bint revcomp): # @DuplicatedSignature + self._revcomp = revcomp + # nuc sequence property getter and setter @property def seq(self): @@ -151,7 +178,10 @@ cdef class Nuc_Seq(Seq) : # sequence quality property getter and setter @property def quality(self): - return self[QUALITY_COLUMN] + if QUALITY_COLUMN in self: + return self[QUALITY_COLUMN] + else: + return None cpdef set_quality(self, object new_quality, int offset=-1): cdef list quality_int @@ -169,7 +199,8 @@ cdef class Nuc_Seq(Seq) : @property def quality_array(self): if self._quality_array is None: - self._quality_array = self.build_quality_array(self[QUALITY_COLUMN]) + if QUALITY_COLUMN in self: + self._quality_array = self.build_quality_array(self[QUALITY_COLUMN]) return self._quality_array cpdef object build_quality_array(self, list quality): @@ -190,7 +221,8 @@ cdef class Nuc_Seq(Seq) : reversed_quality = self.quality[::-1] else: reversed_quality = None - seq = Nuc_Seq(self.id, rev_comp, definition=self.definition, quality=reversed_quality, tags=self) + seq = Nuc_Seq(self.id+b"_CMP", rev_comp, definition=self.definition, quality=reversed_quality, tags=self) + seq.revcomp = True self._reverse_complement = seq return self._reverse_complement @@ -202,6 +234,29 @@ cdef class Nuc_Seq(Seq) : cdef class Seq_Stored(Line) : + def __getitem__(self, object ref): + if type(ref) == int: + return self.get_symbol_at(ref) + elif type(ref) == slice: + return self.get_slice(ref) + else: + return super(Seq_Stored, self).__getitem__(ref) + + cpdef object get_slice(self, slice slice_to_get): + cdef object new_seq + cdef list sliced_quality + cdef type seq_class + seq_class = type(self) + if QUALITY_COLUMN in self: + sliced_quality = self.quality[slice_to_get] + else: + sliced_quality = None + new_seq = seq_class(self.id+b"_SUB", self.seq[slice_to_get], definition=self.definition, quality=sliced_quality, tags=self) # TODO discuss suffix + return new_seq + + cpdef get_symbol_at(self, int pos): + return self.seq[pos:pos+1] + # sequence id property getter and setter @property def id(self): # @ReservedAssignment @DuplicatedSignature @@ -243,7 +298,7 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : self[ID_COLUMN] = tobytes(id) self[NUC_SEQUENCE_COLUMN] = tobytes(seq) - if definition is not None : + if definition is not None: self.definition = tobytes(definition) if quality is not None: if type(quality) == list: @@ -257,21 +312,11 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : for k in tags: k_b = tobytes(k) if k_b not in SPECIAL_COLUMNS: - # TODO discuss convert value to bytes if str if type(tags[k]) == str: self[k_b] = str2bytes(tags[k]) else: self[k_b] = tags[k] - def __getitem__(self, object ref): - if type(ref) == int: - return self.get_symbol_at(ref) - else: - return super(Nuc_Seq_Stored, self).__getitem__(ref) - - cpdef get_symbol_at(self, int pos): - return self.seq[pos:pos+1] - # seq property getter and setter @property def seq(self): @@ -287,15 +332,22 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : self._view.get_column(NUC_SEQUENCE_COLUMN).set_line(self.index, tobytes(new_seq)) cpdef set_quality_int(self, list new_qual): + if QUALITY_COLUMN not in self: + Column.new_column(self._view, QUALITY_COLUMN, OBI_QUAL) self._view.get_column(QUALITY_COLUMN).set_line(self.index, new_qual) cpdef set_quality_char(self, object new_qual, int offset=-1): + if QUALITY_COLUMN not in self: + Column.new_column(self._view, QUALITY_COLUMN, OBI_QUAL) self._view.get_column(QUALITY_COLUMN).set_str_line(self.index, tobytes(new_qual), offset=offset) # quality property getter and setter @property def quality(self): - return self._view.get_column(QUALITY_COLUMN).get_line(self.index) + if QUALITY_COLUMN in self: + return self._view.get_column(QUALITY_COLUMN).get_line(self.index) + else: + return None @quality.setter def quality(self, object new_qual): # @DuplicatedSignature @@ -311,13 +363,17 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : # WARNING: default offset used @property def quality_str(self): - return self._view.get_column(QUALITY_COLUMN).get_str_line(self._index) + if QUALITY_COLUMN in self: + return self._view.get_column(QUALITY_COLUMN).get_str_line(self._index) + else: + return None # sequence quality array property getter and setter @property def quality_array(self): if self._quality_array is None: - self._quality_array = self.build_quality_array(self._view.get_column(QUALITY_COLUMN).get_line(self.index)) + if QUALITY_COLUMN in self: + self._quality_array = self.build_quality_array(self._view.get_column(QUALITY_COLUMN).get_line(self.index)) return self._quality_array cpdef object build_quality_array(self, list quality): @@ -331,14 +387,15 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : @property def reverse_complement(self): cdef bytes rev_comp - cdef object seq_class + cdef list reversed_quality if self._reverse_complement is None: rev_comp = self.build_reverse_complement() if QUALITY_COLUMN in self: reversed_quality = self.quality[::-1] else: reversed_quality = None - seq = Nuc_Seq(self.id, rev_comp, definition=self.definition, quality=reversed_quality, tags=self) + seq = Nuc_Seq(self.id+b"_CMP", rev_comp, definition=self.definition, quality=reversed_quality, tags=self) + seq.revcomp = True self._reverse_complement = seq return self._reverse_complement @@ -356,7 +413,3 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : def __len__(self): return len(self._view.get_column(NUC_SEQUENCE_COLUMN).get_line(self.index)) - - - - \ No newline at end of file