From 156fb04e88ab490d92bfc70f6e9e1b217dff1ab5 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 5 Jan 2018 16:08:36 +0100 Subject: [PATCH] Implemented functions to build reverse complement sequences --- python/obitools3/dms/capi/obiutils.pxd | 2 +- python/obitools3/dms/obiseq.pxd | 12 +- python/obitools3/dms/obiseq.pyx | 120 +++++++++++++++++--- src/utils.c | 146 +++++++++++++++++++++++++ src/utils.h | 48 +++++++- 5 files changed, 303 insertions(+), 25 deletions(-) diff --git a/python/obitools3/dms/capi/obiutils.pxd b/python/obitools3/dms/capi/obiutils.pxd index 726ad0e..9341dff 100644 --- a/python/obitools3/dms/capi/obiutils.pxd +++ b/python/obitools3/dms/capi/obiutils.pxd @@ -9,4 +9,4 @@ from ..capi.obitypes cimport const_char_p cdef extern from "utils.h" nogil: const_char_p obi_format_date(time_t date) - + char* reverse_complement_sequence(char* nucAcSeq) diff --git a/python/obitools3/dms/obiseq.pxd b/python/obitools3/dms/obiseq.pxd index 62fdbcb..b1bd163 100644 --- a/python/obitools3/dms/obiseq.pxd +++ b/python/obitools3/dms/obiseq.pxd @@ -8,9 +8,10 @@ cdef class Seq(dict) : cdef class Nuc_Seq(Seq) : - - cdef object _quality - #cpdef object reverse_complement(self) + + cdef char* _reverse_complement + cdef object _quality_array + cpdef bytes build_reverse_complement(self) cdef class Seq_Stored(Line) : @@ -18,7 +19,8 @@ cdef class Seq_Stored(Line) : cdef class Nuc_Seq_Stored(Seq_Stored) : - + + cdef char* _reverse_complement cpdef set_quality_int(self, list new_qual) cpdef set_quality_char(self, bytes new_qual, int offset=*) - #cpdef object reverse_complement(self) \ No newline at end of file + cpdef bytes build_reverse_complement(self) \ No newline at end of file diff --git a/python/obitools3/dms/obiseq.pyx b/python/obitools3/dms/obiseq.pyx index 5ee1d89..1d5984e 100644 --- a/python/obitools3/dms/obiseq.pyx +++ b/python/obitools3/dms/obiseq.pyx @@ -2,12 +2,23 @@ from obitools3.utils cimport bytes2str, str2bytes, tostr, tobytes +from obitools3.dms.view.view cimport View + +from .capi.obitypes cimport index_t + from .capi.obiview cimport NUC_SEQUENCE_COLUMN, \ ID_COLUMN, \ DEFINITION_COLUMN, \ QUALITY_COLUMN, \ COUNT_COLUMN +from .capi.obiutils cimport reverse_complement_sequence + +from cpython cimport array +import array + +from copy import deepcopy + cdef class Seq(dict) : def __init__(self, object id, object seq, object definition=None, object quality=None, int offset=-1, object tags=None) : @@ -23,11 +34,18 @@ cdef class Seq(dict) : k_b = tobytes(k) self[k_b] = tags[k] + @staticmethod + def new_from_stored(Seq_Stored seq_to_clone) : + cdef Seq new_seq + new_seq = Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone) + return new_seq def __contains__(self, object key): return dict.__contains__(self, tobytes(key)) - + def __str__(self): + return tostr(self[NUC_SEQUENCE_COLUMN]) + # sequence id property getter and setter @property def id(self): # @ReservedAssignment @@ -66,7 +84,31 @@ cdef class Seq(dict) : cdef class Nuc_Seq(Seq) : - + + def __init__(self, object id, object seq, object definition=None, object quality=None, int offset=-1, object tags=None) : + cdef object k + cdef bytes k_b + self[ID_COLUMN] = tobytes(id) + self.seq = tobytes(seq) + if definition is not None : + self.definition = tobytes(definition) + if quality is not None: + self.quality = quality + if type(quality) == list: # TODO discuss + self._quality_array = array.array('d', quality) + self._annotations = {} + self._reverse_complement = NULL # TODO unnecessary? + if tags is not None: + for k in tags: + k_b = tobytes(k) + self[k_b] = tags[k] + + @staticmethod + def new_from_stored(Nuc_Seq_Stored seq_to_clone) : + cdef Nuc_Seq new_seq + new_seq = Nuc_Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone) + return new_seq + # nuc sequence property getter and setter @property def seq(self): @@ -83,10 +125,41 @@ cdef class Nuc_Seq(Seq) : @quality.setter def quality(self, object new_quality): # @DuplicatedSignature - self[QUALITY_COLUMN] = tobytes(new_quality) + self[QUALITY_COLUMN] = new_quality + + # sequence quality array property getter and setter + @property + def quality_array(self): + return self._quality_array + + # reverse complement property getter + @property + def reverse_complement(self): + cdef bytes rev_comp + if self._reverse_complement == NULL: + rev_comp = self.build_reverse_complement() + self._reverse_complement = rev_comp + return self._reverse_complement + + def __str__(self): + return tostr(self[NUC_SEQUENCE_COLUMN]) + + cpdef bytes build_reverse_complement(self) : + cdef bytes rev_comp + rev_comp = deepcopy(self[NUC_SEQUENCE_COLUMN]).lower() + return reverse_complement_sequence(rev_comp) + + +# TODO DISCUSS + # sequence quality (character string format) property getter and setter +# @property +# def quality_str(self): +# return self[QUALITY_COLUMN] +# +# @quality.setter +# def quality_str(self, list new_quality): # @DuplicatedSignature +# self[QUALITY_COLUMN] = new_quality -# cpdef str reverse_complement(self) : TODO in C ? -# pass cdef class Seq_Stored(Line) : @@ -112,19 +185,23 @@ cdef class Seq_Stored(Line) : cdef class Nuc_Seq_Stored(Seq_Stored) : + def __init__(self, View view, index_t line_nb) : + self = Seq_Stored.__init__(self, view, line_nb) + self._reverse_complement = NULL # TODO unnecessary? + cpdef set_quality_int(self, list new_qual): self._view.get_column(QUALITY_COLUMN).set_line(self.index, new_qual) cpdef set_quality_char(self, bytes new_qual, int offset=-1): self._view.get_column(QUALITY_COLUMN).set_str_line(self.index, new_qual, offset=offset) - # nuc_seq property getter and setter + # seq property getter and setter @property - def nuc_seq(self): + def seq(self): return self._view.get_column(NUC_SEQUENCE_COLUMN).get_line(self.index) - @nuc_seq.setter - def nuc_seq(self, object new_seq): # @DuplicatedSignature + @seq.setter + def seq(self, object new_seq): # @DuplicatedSignature self._view.get_column(NUC_SEQUENCE_COLUMN).set_line(self.index, tobytes(new_seq)) # quality property getter and setter @@ -144,16 +221,29 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : # quality character string property getter @property def quality_str(self): - return self._view.get_column(QUALITY_COLUMN).get_str_line(self._index) - + return self._view.get_column(QUALITY_COLUMN).get_str_line(self._index) + + # reverse complement property getter + @property + def reverse_complement(self): + cdef bytes rev_comp + if self._reverse_complement == NULL: + rev_comp = self.build_reverse_complement() + self._reverse_complement = rev_comp + return self._reverse_complement def __str__(self): - return bytes2str(self.nuc_seq) + return tostr(self._view.get_column(NUC_SEQUENCE_COLUMN).get_line(self.index)) + + def __len__(self): + return len(self._view.get_column(NUC_SEQUENCE_COLUMN).get_line(self.index)) + + cpdef bytes build_reverse_complement(self) : + cdef bytes rev_comp + rev_comp = deepcopy(self[NUC_SEQUENCE_COLUMN]).lower() + return reverse_complement_sequence(rev_comp) -# cpdef str reverse_complement(self) : TODO in C ? -# pass - # TODO static method to import OBI_Nuc_Seq to OBI_Nuc_Seq_Stored ? \ No newline at end of file diff --git a/src/utils.c b/src/utils.c index 564e2d8..f115267 100644 --- a/src/utils.c +++ b/src/utils.c @@ -28,6 +28,140 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + +/** + * Internal function returning the complement of a nucleotide base. + * + * @warning The base must be in lower case. + * + * @param nucAc The nucleotide base. + * + * @returns The complement of the nucleotide base. + * @retval The nucleotide base itself if no complement was found. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @note Copied from ecoPCR source code + */ +static char nuc_base_complement(char nucAc); + + +/** + * Internal function returning the complement of a nucleotide sequence. + * + * @warning The sequence must be in lower case. + * @warning The sequence will be replaced by its complement without being copied. + * + * @param nucAcSeq The nucleotide sequence. + * + * @returns The complemented sequence. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @note Copied from ecoPCR source code + */ +static char* nuc_seq_complement(char* nucAcSeq); + + +/** + * Internal function returning the reverse of a nucleotide sequence. + * + * @warning The sequence must be in lower case. + * @warning The sequence will be replaced by its reverse without being copied. + * + * @param str The nucleotide sequence. + * @param isPattern Whether the sequence is a pattern. TODO + * + * @returns The reversed sequence. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @note Copied from ecoPCR source code + */ +static char* reverse_sequence(char* str, char isPattern); + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + +static char nuc_base_complement(char nucAc) +{ + char* c; + + if ((c = strchr(DNA_ALPHA, nucAc))) + return CDNA_ALPHA[(c - DNA_ALPHA)]; + else + return nucAc; +} + + +static char* nuc_seq_complement(char* nucAcSeq) +{ + char *s; + + for (s = nucAcSeq ; *s ; s++) + *s = nuc_base_complement(*s); + + return nucAcSeq; +} + + +static char* reverse_sequence(char* str, char isPattern) +{ + char *sb, *se, c; + + if (! str) + return str; + + sb = str; + se = str + strlen(str) - 1; + + while(sb <= se) { + c = *sb; + *sb++ = *se; + *se-- = c; + } + + sb = str; + se = str + strlen(str) - 1; + + if (isPattern) + for (;sb < se; sb++) + { + if (*sb=='#') + { + if (((se - sb) > 2) && (*(sb+2)=='!')) + { + *sb='!'; + sb+=2; + *sb='#'; + } + else + { + *sb=*(sb+1); + sb++; + *sb='#'; + } + } + else if (*sb=='!') + { + *sb=*(sb-1); + *(sb-1)='!'; + } + } + + return str; +} + /********************************************************************** * @@ -404,3 +538,15 @@ loop: SWAPINIT(a, es); /* qsort(pn - r, r / es, es, cmp);*/ } + +char* reverse_complement_pattern(char* nucAcSeq) +{ + return reverse_sequence(nuc_seq_complement(nucAcSeq), 1); +} + + +char* reverse_complement_sequence(char* nucAcSeq) +{ + return reverse_sequence(nuc_seq_complement(nucAcSeq), 0); +} + diff --git a/src/utils.h b/src/utils.h index f0b8d69..c1156e9 100644 --- a/src/utils.h +++ b/src/utils.h @@ -20,10 +20,16 @@ #include "obitypes.h" -#define FORMATTED_TIME_LENGTH (1024) /**< The length allocated for the character string containing a formatted date. - */ -#define ONE_IF_ZERO(x) (((x)==0)?1:(x)) /**< If x is equal to 0, x takes the value 1. - */ +#define FORMATTED_TIME_LENGTH (1024) /**< The length allocated for the character string containing a formatted date. + */ +#define ONE_IF_ZERO(x) (((x)==0)?1:(x)) /**< If x is equal to 0, x takes the value 1. + */ +#define DNA_ALPHA "acgtbdefhijklmnopqrsuvwxyz#![]" /**< DNA alphabet (IUPAC). + //"ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]" */ +#define CDNA_ALPHA "tgcavhefdijmlknopqysabwxrz#!][" /**< Complementary DNA alphabet (IUPAC). + //"TVGHEFCDIJMLKNOPQYSAABWXRZ#!][" */ + + /** * @brief Copy the content of a file into another file. @@ -155,4 +161,38 @@ void* bsearch_user_data(const void* key, const void* base, size_t num, size_t si void qsort_user_data(void *aa, size_t n, size_t es, const void *user_data, int (*cmp)(const void *, const void *, const void *)); +/** + * Function returning the reverse complement of a nucleotide sequence. + * + * @warning The sequence must be in lower case. + * @warning The sequence will be replaced by its reverse complement without being copied. + * + * @param nucAcSeq The nucleotide sequence. + * + * @returns The reverse complemented sequence. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @note Copied from ecoPCR source code + */ +char* reverse_complement_sequence(char* nucAcSeq); + + +/** + * Function returning the reverse complement of a pattern. + * + * @warning The pattern must be in lower case. + * @warning The pattern will be replaced by its reverse complement without being copied. + * + * @param nucAcSeq The pattern. + * + * @returns The reverse complemented pattern. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @note Copied from ecoPCR source code + */ +char* reverse_complement_pattern(char* nucAcSeq); + + #endif /* UTILS_H_ */