Implemented functions to build reverse complement sequences

This commit is contained in:
Celine Mercier
2018-01-05 16:08:36 +01:00
parent 428c4eb5e6
commit 156fb04e88
5 changed files with 303 additions and 25 deletions

View File

@ -9,4 +9,4 @@ from ..capi.obitypes cimport const_char_p
cdef extern from "utils.h" nogil:
const_char_p obi_format_date(time_t date)
char* reverse_complement_sequence(char* nucAcSeq)

View File

@ -8,9 +8,10 @@ cdef class Seq(dict) :
cdef class Nuc_Seq(Seq) :
cdef object _quality
#cpdef object reverse_complement(self)
cdef char* _reverse_complement
cdef object _quality_array
cpdef bytes build_reverse_complement(self)
cdef class Seq_Stored(Line) :
@ -18,7 +19,8 @@ cdef class Seq_Stored(Line) :
cdef class Nuc_Seq_Stored(Seq_Stored) :
cdef char* _reverse_complement
cpdef set_quality_int(self, list new_qual)
cpdef set_quality_char(self, bytes new_qual, int offset=*)
#cpdef object reverse_complement(self)
cpdef bytes build_reverse_complement(self)

View File

@ -2,12 +2,23 @@
from obitools3.utils cimport bytes2str, str2bytes, tostr, tobytes
from obitools3.dms.view.view cimport View
from .capi.obitypes cimport index_t
from .capi.obiview cimport NUC_SEQUENCE_COLUMN, \
ID_COLUMN, \
DEFINITION_COLUMN, \
QUALITY_COLUMN, \
COUNT_COLUMN
from .capi.obiutils cimport reverse_complement_sequence
from cpython cimport array
import array
from copy import deepcopy
cdef class Seq(dict) :
def __init__(self, object id, object seq, object definition=None, object quality=None, int offset=-1, object tags=None) :
@ -23,11 +34,18 @@ cdef class Seq(dict) :
k_b = tobytes(k)
self[k_b] = tags[k]
@staticmethod
def new_from_stored(Seq_Stored seq_to_clone) :
cdef Seq new_seq
new_seq = Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
return new_seq
def __contains__(self, object key):
return dict.__contains__(self, tobytes(key))
def __str__(self):
return tostr(self[NUC_SEQUENCE_COLUMN])
# sequence id property getter and setter
@property
def id(self): # @ReservedAssignment
@ -66,7 +84,31 @@ cdef class Seq(dict) :
cdef class Nuc_Seq(Seq) :
def __init__(self, object id, object seq, object definition=None, object quality=None, int offset=-1, object tags=None) :
cdef object k
cdef bytes k_b
self[ID_COLUMN] = tobytes(id)
self.seq = tobytes(seq)
if definition is not None :
self.definition = tobytes(definition)
if quality is not None:
self.quality = quality
if type(quality) == list: # TODO discuss
self._quality_array = array.array('d', quality)
self._annotations = {}
self._reverse_complement = NULL # TODO unnecessary?
if tags is not None:
for k in tags:
k_b = tobytes(k)
self[k_b] = tags[k]
@staticmethod
def new_from_stored(Nuc_Seq_Stored seq_to_clone) :
cdef Nuc_Seq new_seq
new_seq = Nuc_Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
return new_seq
# nuc sequence property getter and setter
@property
def seq(self):
@ -83,10 +125,41 @@ cdef class Nuc_Seq(Seq) :
@quality.setter
def quality(self, object new_quality): # @DuplicatedSignature
self[QUALITY_COLUMN] = tobytes(new_quality)
self[QUALITY_COLUMN] = new_quality
# sequence quality array property getter and setter
@property
def quality_array(self):
return self._quality_array
# reverse complement property getter
@property
def reverse_complement(self):
cdef bytes rev_comp
if self._reverse_complement == NULL:
rev_comp = self.build_reverse_complement()
self._reverse_complement = rev_comp
return self._reverse_complement
def __str__(self):
return tostr(self[NUC_SEQUENCE_COLUMN])
cpdef bytes build_reverse_complement(self) :
cdef bytes rev_comp
rev_comp = deepcopy(self[NUC_SEQUENCE_COLUMN]).lower()
return <bytes>reverse_complement_sequence(rev_comp)
# TODO DISCUSS
# sequence quality (character string format) property getter and setter
# @property
# def quality_str(self):
# return self[QUALITY_COLUMN]
#
# @quality.setter
# def quality_str(self, list new_quality): # @DuplicatedSignature
# self[QUALITY_COLUMN] = new_quality
# cpdef str reverse_complement(self) : TODO in C ?
# pass
cdef class Seq_Stored(Line) :
@ -112,19 +185,23 @@ cdef class Seq_Stored(Line) :
cdef class Nuc_Seq_Stored(Seq_Stored) :
def __init__(self, View view, index_t line_nb) :
self = Seq_Stored.__init__(self, view, line_nb)
self._reverse_complement = NULL # TODO unnecessary?
cpdef set_quality_int(self, list new_qual):
self._view.get_column(QUALITY_COLUMN).set_line(self.index, new_qual)
cpdef set_quality_char(self, bytes new_qual, int offset=-1):
self._view.get_column(QUALITY_COLUMN).set_str_line(self.index, new_qual, offset=offset)
# nuc_seq property getter and setter
# seq property getter and setter
@property
def nuc_seq(self):
def seq(self):
return self._view.get_column(NUC_SEQUENCE_COLUMN).get_line(self.index)
@nuc_seq.setter
def nuc_seq(self, object new_seq): # @DuplicatedSignature
@seq.setter
def seq(self, object new_seq): # @DuplicatedSignature
self._view.get_column(NUC_SEQUENCE_COLUMN).set_line(self.index, tobytes(new_seq))
# quality property getter and setter
@ -144,16 +221,29 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
# quality character string property getter
@property
def quality_str(self):
return self._view.get_column(QUALITY_COLUMN).get_str_line(self._index)
return self._view.get_column(QUALITY_COLUMN).get_str_line(self._index)
# reverse complement property getter
@property
def reverse_complement(self):
cdef bytes rev_comp
if self._reverse_complement == NULL:
rev_comp = self.build_reverse_complement()
self._reverse_complement = rev_comp
return self._reverse_complement
def __str__(self):
return bytes2str(self.nuc_seq)
return tostr(self._view.get_column(NUC_SEQUENCE_COLUMN).get_line(self.index))
def __len__(self):
return len(self._view.get_column(NUC_SEQUENCE_COLUMN).get_line(self.index))
cpdef bytes build_reverse_complement(self) :
cdef bytes rev_comp
rev_comp = deepcopy(self[NUC_SEQUENCE_COLUMN]).lower()
return <bytes>reverse_complement_sequence(rev_comp)
# cpdef str reverse_complement(self) : TODO in C ?
# pass
# TODO static method to import OBI_Nuc_Seq to OBI_Nuc_Seq_Stored ?

View File

@ -28,6 +28,140 @@
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
/**************************************************************************
*
* D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S
*
**************************************************************************/
/**
* Internal function returning the complement of a nucleotide base.
*
* @warning The base must be in lower case.
*
* @param nucAc The nucleotide base.
*
* @returns The complement of the nucleotide base.
* @retval The nucleotide base itself if no complement was found.
*
* @since December 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @note Copied from ecoPCR source code
*/
static char nuc_base_complement(char nucAc);
/**
* Internal function returning the complement of a nucleotide sequence.
*
* @warning The sequence must be in lower case.
* @warning The sequence will be replaced by its complement without being copied.
*
* @param nucAcSeq The nucleotide sequence.
*
* @returns The complemented sequence.
*
* @since December 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @note Copied from ecoPCR source code
*/
static char* nuc_seq_complement(char* nucAcSeq);
/**
* Internal function returning the reverse of a nucleotide sequence.
*
* @warning The sequence must be in lower case.
* @warning The sequence will be replaced by its reverse without being copied.
*
* @param str The nucleotide sequence.
* @param isPattern Whether the sequence is a pattern. TODO
*
* @returns The reversed sequence.
*
* @since December 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @note Copied from ecoPCR source code
*/
static char* reverse_sequence(char* str, char isPattern);
/************************************************************************
*
* D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S
*
************************************************************************/
static char nuc_base_complement(char nucAc)
{
char* c;
if ((c = strchr(DNA_ALPHA, nucAc)))
return CDNA_ALPHA[(c - DNA_ALPHA)];
else
return nucAc;
}
static char* nuc_seq_complement(char* nucAcSeq)
{
char *s;
for (s = nucAcSeq ; *s ; s++)
*s = nuc_base_complement(*s);
return nucAcSeq;
}
static char* reverse_sequence(char* str, char isPattern)
{
char *sb, *se, c;
if (! str)
return str;
sb = str;
se = str + strlen(str) - 1;
while(sb <= se) {
c = *sb;
*sb++ = *se;
*se-- = c;
}
sb = str;
se = str + strlen(str) - 1;
if (isPattern)
for (;sb < se; sb++)
{
if (*sb=='#')
{
if (((se - sb) > 2) && (*(sb+2)=='!'))
{
*sb='!';
sb+=2;
*sb='#';
}
else
{
*sb=*(sb+1);
sb++;
*sb='#';
}
}
else if (*sb=='!')
{
*sb=*(sb-1);
*(sb-1)='!';
}
}
return str;
}
/**********************************************************************
*
@ -404,3 +538,15 @@ loop: SWAPINIT(a, es);
/* qsort(pn - r, r / es, es, cmp);*/
}
char* reverse_complement_pattern(char* nucAcSeq)
{
return reverse_sequence(nuc_seq_complement(nucAcSeq), 1);
}
char* reverse_complement_sequence(char* nucAcSeq)
{
return reverse_sequence(nuc_seq_complement(nucAcSeq), 0);
}

View File

@ -20,10 +20,16 @@
#include "obitypes.h"
#define FORMATTED_TIME_LENGTH (1024) /**< The length allocated for the character string containing a formatted date.
*/
#define ONE_IF_ZERO(x) (((x)==0)?1:(x)) /**< If x is equal to 0, x takes the value 1.
*/
#define FORMATTED_TIME_LENGTH (1024) /**< The length allocated for the character string containing a formatted date.
*/
#define ONE_IF_ZERO(x) (((x)==0)?1:(x)) /**< If x is equal to 0, x takes the value 1.
*/
#define DNA_ALPHA "acgtbdefhijklmnopqrsuvwxyz#![]" /**< DNA alphabet (IUPAC).
//"ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]" */
#define CDNA_ALPHA "tgcavhefdijmlknopqysabwxrz#!][" /**< Complementary DNA alphabet (IUPAC).
//"TVGHEFCDIJMLKNOPQYSAABWXRZ#!][" */
/**
* @brief Copy the content of a file into another file.
@ -155,4 +161,38 @@ void* bsearch_user_data(const void* key, const void* base, size_t num, size_t si
void qsort_user_data(void *aa, size_t n, size_t es, const void *user_data, int (*cmp)(const void *, const void *, const void *));
/**
* Function returning the reverse complement of a nucleotide sequence.
*
* @warning The sequence must be in lower case.
* @warning The sequence will be replaced by its reverse complement without being copied.
*
* @param nucAcSeq The nucleotide sequence.
*
* @returns The reverse complemented sequence.
*
* @since December 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @note Copied from ecoPCR source code
*/
char* reverse_complement_sequence(char* nucAcSeq);
/**
* Function returning the reverse complement of a pattern.
*
* @warning The pattern must be in lower case.
* @warning The pattern will be replaced by its reverse complement without being copied.
*
* @param nucAcSeq The pattern.
*
* @returns The reverse complemented pattern.
*
* @since December 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @note Copied from ecoPCR source code
*/
char* reverse_complement_pattern(char* nucAcSeq);
#endif /* UTILS_H_ */