From 2e191372d73acfd9ae5007c6b3a6233595efe080 Mon Sep 17 00:00:00 2001
From: Celine Mercier <celine.mercier@metabarcoding.org>
Date: Fri, 30 Oct 2020 10:46:17 +0100
Subject: [PATCH] Now handling sequences with Uracil (U) nucleotides by
 converting to Thymine (T)

---
 src/encode.c |  6 ++++++
 src/encode.h | 44 +++++++++++++++++++++++---------------------
 2 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/src/encode.c b/src/encode.c
index 085db80..c5232da 100755
--- a/src/encode.c
+++ b/src/encode.c
@@ -36,10 +36,12 @@ bool only_ATGC(const char* seq)
 	{
 		if (!((*c == 'A') || \
 			  (*c == 'T') || \
+			  (*c == 'U') || \
 			  (*c == 'G') || \
 			  (*c == 'C') || \
 			  (*c == 'a') || \
 			  (*c == 't') || \
+			  (*c == 'u') || \
 			  (*c == 'g') || \
 			  (*c == 'c')))
 		{
@@ -182,6 +184,8 @@ byte_t* encode_seq_on_2_bits(const char* seq, int32_t length)
 			break;
 		case 't':
 		case 'T':
+		case 'u':
+		case 'U':
 			seq_b[i/4] |= NUC_T_2b;
 			break;
 		default:
@@ -288,6 +292,8 @@ byte_t* encode_seq_on_4_bits(const char* seq, int32_t length)
 			break;
 		case 't':
 		case 'T':
+		case 'u': // Uracil is encoded as Thymine (debatable: the U/T distinction is lost)
+		case 'U':
 			seq_b[i/2] |= NUC_T_4b;
 			break;
 		case 'r':
diff --git a/src/encode.h b/src/encode.h
index db8a214..2b58b8c 100755
--- a/src/encode.h
+++ b/src/encode.h
@@ -64,7 +64,7 @@ enum
 
 
 /**
- * @brief Checks if there are only 'atgcATGC' characters in a
+ * @brief Checks if there are only 'atgcuATGCU' characters in a
  *        character string.
  *
  * @param seq The sequence to check.
@@ -129,12 +129,13 @@ byte_t get_nucleotide_from_encoded_seq(byte_t* seq, int32_t idx, uint8_t encodin
 /**
  * @brief Encodes a DNA sequence with each nucleotide coded on 2 bits.
  *
- *    A or a : 00
- *    C or c : 01
- *    T or t : 10
- *    G or g : 11
+ *    A or a :           00
+ *    C or c :           01
+ *    T or t or U or u : 10
+ *    G or g :           11
  *
- * @warning The DNA sequence must contain only 'atgcATGC' characters.
+ * @warning The DNA sequence must contain only 'atgcuATGCU' characters.
+ * @warning Uracil ('U') bases are encoded as Thymine ('T') bases.
  *
  * @param seq The sequence to encode.
  * @param length The length of the sequence to encode.
@@ -169,23 +170,24 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq);
 /**
  * @brief Encodes a DNA sequence with each nucleotide coded on 4 bits.
  *
- *		A or a : 0001
- *      C or c : 0010
- *      G or g : 0011
- *      T or t : 0100
- *      R or r : 0101
- *      Y or y : 0110
- *      S or s : 0111
- *      W or w : 1000
- *      K or k : 1001
- *      M or m : 1010
- *      B or b : 1011
- *      D or d : 1100
- *      H or h : 1101
- *      V or v : 1110
- *      N or n : 1111
+ *      A or a :           0001
+ *      C or c :           0010
+ *      G or g :           0011
+ *      T or t or U or u : 0100
+ *      R or r :           0101
+ *      Y or y :           0110
+ *      S or s :           0111
+ *      W or w :           1000
+ *      K or k :           1001
+ *      M or m :           1010
+ *      B or b :           1011
+ *      D or d :           1100
+ *      H or h :           1101
+ *      V or v :           1110
+ *      N or n :           1111
  *
  * @warning The DNA sequence must contain only IUPAC characters.
+ * @warning Uracil ('U') bases are encoded as Thymine ('T') bases.
  *
  * @param seq The sequence to encode.
  * @param length The length of the sequence to encode.