diff --git a/python/obitools3/dms/capi/obitypes.pxd b/python/obitools3/dms/capi/obitypes.pxd index efc2fb5..1f194f8 100644 --- a/python/obitools3/dms/capi/obitypes.pxd +++ b/python/obitools3/dms/capi/obitypes.pxd @@ -12,6 +12,8 @@ cdef extern from *: cdef extern from "encode.h" nogil: bint only_ATGC(const_char_p seq) + bint only_IUPAC_DNA(const_char_p seq) + bint is_a_DNA_seq(const_char_p seq) cdef extern from "obitypes.h" nogil: diff --git a/src/encode.c b/src/encode.c index 0611b8f..e9f4f29 100644 --- a/src/encode.c +++ b/src/encode.c @@ -54,6 +54,66 @@ bool only_ATGC(const char* seq) } +bool only_IUPAC_DNA(const char* seq) +{ + const char* c = seq; + + while (*c) /* scan up to the terminating NUL */ + { + if (!((*c == 'A') || \ + (*c == 'T') || \ + (*c == 'G') || \ + (*c == 'C') || \ + (*c == 'U') || /* U is accepted in addition to A, T, G, C */ \ + (*c == 'R') || \ + (*c == 'Y') || \ + (*c == 'S') || \ + (*c == 'W') || \ + (*c == 'K') || \ + (*c == 'M') || \ + (*c == 'B') || \ + (*c == 'D') || \ + (*c == 'H') || \ + (*c == 'V') || \ + (*c == 'N') || \ + (*c == 'a') || \ + (*c == 't') || \ + (*c == 'g') || \ + (*c == 'c') || \ + (*c == 'u') || \ + (*c == 'r') || \ + (*c == 'y') || \ + (*c == 's') || \ + (*c == 'w') || \ + (*c == 'k') || \ + (*c == 'm') || \ + (*c == 'b') || \ + (*c == 'd') || \ + (*c == 'h') || \ + (*c == 'v') || \ + (*c == 'n') || \ + (*c == '.') || /* '.' and '-' are accepted too (presumably gap symbols; confirm) */ \ + (*c == '-'))) + { + return 0; /* first character outside the accepted set */ + } + else + { + c++; + } + } + return 1; /* no character was rejected (also reached for an empty string) */ +} + + +bool is_a_DNA_seq(const char* seq) +{ + if (only_ATGC(seq)) /* the ATGC-only check is tried first */ + return true; + return only_IUPAC_DNA(seq); /* otherwise the IUPAC check decides */ +} + + byte_t* encode_seq_on_2_bits(const char* seq, int32_t length) { byte_t* seq_b; diff --git a/src/encode.h b/src/encode.h index 5720dda..6acd906 100644 --- a/src/encode.h +++ b/src/encode.h @@ -78,6 +78,36 @@ enum bool only_ATGC(const char* seq); +/** + * @brief Checks if there are only IUPAC DNA characters in a + * character string (upper or lower case; U, '.' and '-' are accepted too). + * + * @param seq The sequence to check (NUL-terminated character string). + * + * @returns A boolean value indicating if there are only + * IUPAC DNA characters in a character string (true for an empty string). 
+ * + * @since May 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +bool only_IUPAC_DNA(const char* seq); + + +/** + * @brief Checks if a character string can be read as a DNA sequence encoded + * with ACGT or IUPAC characters (in capital letters or not). + * + * @param seq The sequence to check. + * + * @returns A boolean value indicating if the character string + * can be read as a DNA sequence (true when only_ATGC() or only_IUPAC_DNA() accepts it). + * + * @since May 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +bool is_a_DNA_seq(const char* seq); + + /** * @brief Encodes a DNA sequence with each nucleotide coded on 2 bits. *