From 2e191372d73acfd9ae5007c6b3a6233595efe080 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 30 Oct 2020 10:46:17 +0100 Subject: [PATCH] Now handling sequences with Uracil (U) nucleotides by converting to Thymine (T) --- src/encode.c | 6 ++++++ src/encode.h | 44 +++++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/encode.c b/src/encode.c index 085db80..c5232da 100755 --- a/src/encode.c +++ b/src/encode.c @@ -36,10 +36,12 @@ bool only_ATGC(const char* seq) { if (!((*c == 'A') || \ (*c == 'T') || \ + (*c == 'U') || \ (*c == 'G') || \ (*c == 'C') || \ (*c == 'a') || \ (*c == 't') || \ + (*c == 'u') || \ (*c == 'g') || \ (*c == 'c'))) { @@ -182,6 +184,8 @@ byte_t* encode_seq_on_2_bits(const char* seq, int32_t length) break; case 't': case 'T': + case 'u': + case 'U': seq_b[i/4] |= NUC_T_2b; break; default: @@ -288,6 +292,8 @@ byte_t* encode_seq_on_4_bits(const char* seq, int32_t length) break; case 't': case 'T': + case 'u': // discussable + case 'U': seq_b[i/2] |= NUC_T_4b; break; case 'r': diff --git a/src/encode.h b/src/encode.h index db8a214..2b58b8c 100755 --- a/src/encode.h +++ b/src/encode.h @@ -64,7 +64,7 @@ enum /** - * @brief Checks if there are only 'atgcATGC' characters in a + * @brief Checks if there are only 'atgcuATGCU' characters in a * character string. * * @param seq The sequence to check. @@ -129,12 +129,13 @@ byte_t get_nucleotide_from_encoded_seq(byte_t* seq, int32_t idx, uint8_t encodin /** * @brief Encodes a DNA sequence with each nucleotide coded on 2 bits. * - * A or a : 00 - * C or c : 01 - * T or t : 10 - * G or g : 11 + * A or a : 00 + * C or c : 01 + * T or t or U or u : 10 + * G or g : 11 * - * @warning The DNA sequence must contain only 'atgcATGC' characters. + * @warning The DNA sequence must contain only 'atgcuATGCU' characters. + * @warning Uracil ('U') bases are encoded as Thymine ('T') bases. * * @param seq The sequence to encode. * @param length The length of the sequence to encode. @@ -169,23 +170,24 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq); /** * @brief Encodes a DNA sequence with each nucleotide coded on 4 bits. * - * A or a : 0001 - * C or c : 0010 - * G or g : 0011 - * T or t : 0100 - * R or r : 0101 - * Y or y : 0110 - * S or s : 0111 - * W or w : 1000 - * K or k : 1001 - * M or m : 1010 - * B or b : 1011 - * D or d : 1100 - * H or h : 1101 - * V or v : 1110 - * N or n : 1111 + * A or a : 0001 + * C or c : 0010 + * G or g : 0011 + * T or t or U or u : 0100 + * R or r : 0101 + * Y or y : 0110 + * S or s : 0111 + * W or w : 1000 + * K or k : 1001 + * M or m : 1010 + * B or b : 1011 + * D or d : 1100 + * H or h : 1101 + * V or v : 1110 + * N or n : 1111 * * @warning The DNA sequence must contain only IUPAC characters. + * @warning Uracil ('U') bases are encoded as Thymine ('T') bases. * * @param seq The sequence to encode. * @param length The length of the sequence to encode.