C functions to detect IUPAC sequences
This commit is contained in:
@ -12,6 +12,8 @@ cdef extern from *:
|
||||
|
||||
cdef extern from "encode.h" nogil:
|
||||
bint only_ATGC(const_char_p seq)
|
||||
bint only_IUPAC_DNA(const_char_p seq)
|
||||
bint is_a_DNA_seq(const_char_p seq)
|
||||
|
||||
|
||||
cdef extern from "obitypes.h" nogil:
|
||||
|
60
src/encode.c
60
src/encode.c
@ -54,6 +54,66 @@ bool only_ATGC(const char* seq)
|
||||
}
|
||||
|
||||
|
||||
/*
 * Checks whether a character string contains only IUPAC DNA characters.
 *
 * Accepted characters are the nucleotide codes A, C, G, T, U and the
 * ambiguity codes R, Y, S, W, K, M, B, D, H, V, N (each in upper or lower
 * case), plus the gap symbols '.' and '-'.
 *
 * seq: NUL-terminated character string to check.
 *
 * Returns true if every character before the terminator is accepted (an
 * empty string is therefore accepted), and false at the first character
 * that is not. A null pointer is rejected instead of being dereferenced.
 */
bool only_IUPAC_DNA(const char* seq)
{
	const char* c;

	if (!seq)
	{
		return false;
	}

	for (c = seq; *c; c++)
	{
		switch (*c)
		{
			/* Unambiguous nucleotides */
			case 'A': case 'C': case 'G': case 'T': case 'U':
			case 'a': case 'c': case 'g': case 't': case 'u':
			/* Two-base ambiguity codes */
			case 'R': case 'Y': case 'S': case 'W': case 'K': case 'M':
			case 'r': case 'y': case 's': case 'w': case 'k': case 'm':
			/* Three-base ambiguity codes and "any base" */
			case 'B': case 'D': case 'H': case 'V': case 'N':
			case 'b': case 'd': case 'h': case 'v': case 'n':
			/* Gap symbols */
			case '.': case '-':
				break;

			default:
				return false;
		}
	}

	return true;
}
|
||||
|
||||
|
||||
/*
 * Tells whether a character string can be read as a DNA sequence.
 *
 * The string is first tested with only_ATGC(); only_IUPAC_DNA() is called
 * only when that first test fails.
 *
 * seq: The character string to check.
 *
 * Returns true if either check succeeds, false otherwise.
 */
bool is_a_DNA_seq(const char* seq)
{
	return only_ATGC(seq) || only_IUPAC_DNA(seq);
}
|
||||
|
||||
|
||||
byte_t* encode_seq_on_2_bits(const char* seq, int32_t length)
|
||||
{
|
||||
byte_t* seq_b;
|
||||
|
30
src/encode.h
30
src/encode.h
@ -78,6 +78,36 @@ enum
|
||||
bool only_ATGC(const char* seq);
|
||||
|
||||
|
||||
/**
 * @brief Checks whether a character string contains only IUPAC DNA
 *        characters.
 *
 * The accepted characters are A, C, G, T, U, R, Y, S, W, K, M, B, D, H, V
 * and N (in upper or lower case), as well as '.' and '-'.
 *
 * @param seq The sequence to check.
 *
 * @returns A boolean value indicating whether the character string
 *          contains only IUPAC DNA characters.
 *
 * @since May 2017
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
|
||||
bool only_IUPAC_DNA(const char* seq);
|
||||
|
||||
|
||||
/**
 * @brief Checks whether a character string can be read as a DNA sequence
 *        encoded with ACGT or IUPAC characters (in upper or lower case).
 *
 * @param seq The sequence to check.
 *
 * @returns A boolean value indicating whether the character string
 *          can be read as a DNA sequence.
 *
 * @since May 2017
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
|
||||
bool is_a_DNA_seq(const char* seq);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Encodes a DNA sequence with each nucleotide coded on 2 bits.
|
||||
*
|
||||
|
Reference in New Issue
Block a user