DNA sequences are now encoded on 4 bits when they are in IUPAC

2015-11-20 15:32:09 +01:00
parent 87044b41d8
commit 6aa2f92930
5 changed files with 318 additions and 54 deletions
--- a/python/obitools3/unit_tests.py
+++ b/python/obitools3/unit_tests.py
@ -65,7 +65,7 @@ def random_obivalue(data_type):
        return randoms
    elif data_type == "OBI_SEQ" :
        length = randint(1,200)
-        randoms = ''.join(choice("atgc") for i in range(length))
+        randoms = ''.join(choice("atgcryswkmdbhvn") for i in range(length))
        return randoms

 class OBIDMS_Column_TestCase(unittest.TestCase):
--- a/src/encode.c
+++ b/src/encode.c
@ -58,12 +58,9 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
 	byte_t*  seq_b;
 	uint8_t  modulo;
 	int32_t  length_b;
-	int32_t   i;
-
-//	fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>Encoding sequence %s", seq);
+	int32_t  i;

 	length_b = ceil((double) length / (double) 4.0);
-//	fprintf(stderr, "\nLength: %d", length_b);

 	seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));

@ -80,27 +77,19 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
 		{
 		case 'a':
 		case 'A':
-			seq_b[i/4] |= NUC_A;
-//			fprintf(stderr, "\nIn byte %d, writing A:", i/4);
-//			print_bits(seq_b, length_b);
+			seq_b[i/4] |= NUC_A_2b;
 			break;
 		case 'c':
 		case 'C':
-			seq_b[i/4] |= NUC_C;
-//			fprintf(stderr, "\nIn byte %d, writing C:", i/4);
-//			print_bits(seq_b, length_b);
+			seq_b[i/4] |= NUC_C_2b;
 			break;
 		case 'g':
 		case 'G':
-			seq_b[i/4] |= NUC_G;
-//			fprintf(stderr, "\nIn byte %d, writing G:", i/4);
-//			print_bits(seq_b, length_b);
+			seq_b[i/4] |= NUC_G_2b;
 			break;
 		case 't':
 		case 'T':
-			seq_b[i/4] |= NUC_T;
-//			fprintf(stderr, "\nIn byte %d, writing T:", i/4);
-//			print_bits(seq_b, length_b);
+			seq_b[i/4] |= NUC_T_2b;
 			break;
 		default:
 			obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])");
@ -113,9 +102,6 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
 	if (modulo)
 		seq_b[(i-1)/4] <<= (2*(4 - modulo));

-//	fprintf(stderr, "\n>>>>>>>>>Encoded:");
-//	print_bits(seq_b, length_b);
-
 	return seq_b;
 }

@ -123,7 +109,7 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
 char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
 {
 	char*   seq;
-	int32_t  i;
+	int32_t i;
 	uint8_t shift;
 	uint8_t mask;
 	uint8_t nuc;
@ -133,21 +119,21 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
 	for (i=0; i<length_seq; i++)
 	{
 		shift = 6 - 2*(i % 4);
-		mask = NUC_MASK << shift;
+		mask = NUC_MASK_2B << shift;
 		nuc = (seq_b[i/4] & mask) >> shift;

 		switch (nuc)
 		{
-		case NUC_A:
+		case NUC_A_2b:
 			seq[i] = 'a';
 			break;
-		case NUC_C:
+		case NUC_C_2b:
 			seq[i] = 'c';
 			break;
-		case NUC_G:
+		case NUC_G_2b:
 			seq[i] = 'g';
 			break;
-		case NUC_T:
+		case NUC_T_2b:
 			seq[i] = 't';
 			break;
 		default:
@ -162,6 +148,178 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
 }


+byte_t* encode_seq_on_4_bits(char* seq, int32_t length)
+{
+	byte_t*  seq_b;
+	uint8_t  modulo;
+	int32_t  length_b;
+	int32_t  i;
+
+	length_b = ceil((double) length / (double) 2.0);
+
+	seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
+
+	// Initialize all the bits to 0
+	memset(seq_b, 0, length_b);
+
+	for (i=0; i<length; i++)
+	{
+		// Shift of 4 to make place for new nucleotide
+		seq_b[i/2] <<= 4;
+
+		// Add new nucleotide
+		switch (seq[i])
+		{
+		case 'a':
+		case 'A':
+			seq_b[i/2] |= NUC_A_4b;
+			break;
+		case 'c':
+		case 'C':
+			seq_b[i/2] |= NUC_C_4b;
+			break;
+		case 'g':
+		case 'G':
+			seq_b[i/2] |= NUC_G_4b;
+			break;
+		case 't':
+		case 'T':
+			seq_b[i/2] |= NUC_T_4b;
+			break;
+		case 'r':
+		case 'R':
+			seq_b[i/2] |= NUC_R_4b;
+			break;
+		case 'y':
+		case 'Y':
+			seq_b[i/2] |= NUC_Y_4b;
+			break;
+		case 's':
+		case 'S':
+			seq_b[i/2] |= NUC_S_4b;
+			break;
+		case 'w':
+		case 'W':
+			seq_b[i/2] |= NUC_W_4b;
+			break;
+		case 'k':
+		case 'K':
+			seq_b[i/2] |= NUC_K_4b;
+			break;
+		case 'm':
+		case 'M':
+			seq_b[i/2] |= NUC_M_4b;
+			break;
+		case 'b':
+		case 'B':
+			seq_b[i/2] |= NUC_B_4b;
+			break;
+		case 'd':
+		case 'D':
+			seq_b[i/2] |= NUC_D_4b;
+			break;
+		case 'h':
+		case 'H':
+			seq_b[i/2] |= NUC_H_4b;
+			break;
+		case 'v':
+		case 'V':
+			seq_b[i/2] |= NUC_V_4b;
+			break;
+		case 'n':
+		case 'N':
+			seq_b[i/2] |= NUC_N_4b;
+			break;
+		default:
+			obidebug(1, "\nInvalid nucleotide base when encoding (not IUPAC)");
+			return NULL;
+		}
+	}
+
+	// Final shift for the last byte if needed
+	modulo = (length % 2);
+	if (modulo)
+		seq_b[(i-1)/2] <<= (4*modulo);
+
+	return seq_b;
+}
+
+
+char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
+{
+	char*   seq;
+	int32_t i;
+	uint8_t shift;
+	uint8_t mask;
+	uint8_t nuc;
+
+	seq = (char*) malloc((length_seq+1) * sizeof(char));
+
+	for (i=0; i<length_seq; i++)
+	{
+		shift = 4 - 4*(i % 2);
+		mask = NUC_MASK_4B << shift;
+		nuc = (seq_b[i/2] & mask) >> shift;
+
+		switch (nuc)
+		{
+		case NUC_A_4b:
+			seq[i] = 'a';
+			break;
+		case NUC_C_4b:
+			seq[i] = 'c';
+			break;
+		case NUC_G_4b:
+			seq[i] = 'g';
+			break;
+		case NUC_T_4b:
+			seq[i] = 't';
+			break;
+		case NUC_R_4b:
+			seq[i] = 'r';
+			break;
+		case NUC_Y_4b:
+			seq[i] = 'y';
+			break;
+		case NUC_S_4b:
+			seq[i] = 's';
+			break;
+		case NUC_W_4b:
+			seq[i] = 'w';
+			break;
+		case NUC_K_4b:
+			seq[i] = 'k';
+			break;
+		case NUC_M_4b:
+			seq[i] = 'm';
+			break;
+		case NUC_B_4b:
+			seq[i] = 'b';
+			break;
+		case NUC_D_4b:
+			seq[i] = 'd';
+			break;
+		case NUC_H_4b:
+			seq[i] = 'h';
+			break;
+		case NUC_V_4b:
+			seq[i] = 'v';
+			break;
+		case NUC_N_4b:
+			seq[i] = 'n';
+			break;
+		default:
+			obidebug(1, "\nInvalid nucleotide base when decoding");
+			return NULL;
+		}
+	}
+
+	seq[length_seq] = '\0';
+
+	return seq;
+}
+
+
 ////////// FOR DEBUGGING ///////////

 // little endian
--- a/src/encode.h
+++ b/src/encode.h
@ -18,7 +18,8 @@
 #include "obiarray.h"


-#define NUC_MASK 0x3   /**< Binary: 11 to use when decoding */
+#define NUC_MASK_2B 0x3   /**< Binary: 11 to use when decoding 2 bits sequences */
+#define NUC_MASK_4B 0xF  /**< Binary: 1111 to use when decoding 4 bits sequences */


 /**
@ -26,10 +27,33 @@
 */
 enum
 {
-    NUC_A = 0x0,   /* binary: 00 */
-    NUC_C = 0x1,   /* binary: 01 */
-    NUC_G = 0x2,   /* binary: 10 */
-    NUC_T = 0x3,   /* binary: 11 */
+    NUC_A_2b = 0x0,   /* binary: 00 */
+    NUC_C_2b = 0x1,   /* binary: 01 */
+    NUC_G_2b = 0x2,   /* binary: 10 */
+    NUC_T_2b = 0x3,   /* binary: 11 */
+};
+
+
+/**
+ * @brief enum for the 4-bits codes for each of the 15 IUPAC nucleotides.
+ */
+enum
+{
+    NUC_A_4b = 0x1,   /* binary: 0001 */
+    NUC_C_4b = 0x2,   /* binary: 0010 */
+    NUC_G_4b = 0x3,   /* binary: 0011 */
+    NUC_T_4b = 0x4,   /* binary: 0100 */
+	NUC_R_4b = 0x5,   /* binary: 0101 */
+	NUC_Y_4b = 0x6,   /* binary: 0110 */
+	NUC_S_4b = 0x7,   /* binary: 0111 */
+	NUC_W_4b = 0x8,   /* binary: 1000 */
+	NUC_K_4b = 0x9,   /* binary: 1001 */
+	NUC_M_4b = 0xA,   /* binary: 1010 */
+	NUC_B_4b = 0xB,   /* binary: 1011 */
+	NUC_D_4b = 0xC,   /* binary: 1100 */
+	NUC_H_4b = 0xD,   /* binary: 1101 */
+	NUC_V_4b = 0xE,   /* binary: 1110 */
+	NUC_N_4b = 0xF,   /* binary: 1111 */
 };


@ -72,10 +96,10 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length);
 /**
 * @brief Decodes a DNA sequence that is coded with each nucleotide on 2 bits.
 *
- *    A or a : 00
- *    C or c : 01
- *    T or t : 10
- *    G or g : 11
+ *    00 -> a
+ *    01 -> c
+ *    10 -> t
+ *    11 -> g
 *
 * @param seq The sequence to decode.
 * @param length_seq The initial length of the sequence before it was encoded.
@ -88,6 +112,68 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length);
 char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq);


+/**
+ * @brief Encodes a DNA sequence with each nucleotide coded on 4 bits.
+ *
+ *		A or a : 0001
+ *      C or c : 0010
+ *      G or g : 0011
+ *      T or t : 0100
+ *      R or r : 0101
+ *      Y or y : 0110
+ *      S or s : 0111
+ *      W or w : 1000
+ *      K or k : 1001
+ *      M or m : 1010
+ *      B or b : 1011
+ *      D or d : 1100
+ *      H or h : 1101
+ *      V or v : 1110
+ *      N or n : 1111
+ *
+ * @warning The DNA sequence must contain only IUPAC characters.
+ *
+ * @param seq The sequence to encode.
+ * @param length The length of the sequence to encode.
+ *
+ * @returns The encoded sequence.
+ *
+ * @since November 2015
+ * @author Celine Mercier (celine.mercier@metabarcoding.org)
+ */
+byte_t* encode_seq_on_4_bits(char* seq, int32_t length);
+
+
+/**
+ * @brief Decodes a DNA sequence that is coded with each nucleotide on 4 bits.
+ *
+ *		A or a : 0001
+ *      C or c : 0010
+ *      G or g : 0011
+ *      T or t : 0100
+ *      R or r : 0101
+ *      Y or y : 0110
+ *      S or s : 0111
+ *      W or w : 1000
+ *      K or k : 1001
+ *      M or m : 1010
+ *      B or b : 1011
+ *      D or d : 1100
+ *      H or h : 1101
+ *      V or v : 1110
+ *      N or n : 1111
+ *
+ * @param seq The sequence to decode.
+ * @param length_seq The initial length of the sequence before it was encoded.
+ *
+ * @returns The decoded sequence ended with '\0'.
+ *
+ * @since November 2015
+ * @author Celine Mercier (celine.mercier@metabarcoding.org)
+ */
+char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq);
+
+
 ////////// FOR DEBUGGING ///////////

 // little endian
--- a/src/obiarray.c
+++ b/src/obiarray.c
@ -1007,8 +1007,6 @@ index_t obi_array_add(OBIDMS_array_p array, byte_t* value)
 	(array->first)[idx] = data_size_used;

 	// Store the value itself at the end of the data
-//	fprintf(stderr, "\nMEMCOPYING TO STORE, with size %ld :", value_size);
-//	printBits(value_size, value);
 	memcpy((((array->data)->data)+data_size_used), value, value_size);

 	// Update the data size
@ -1090,10 +1088,11 @@ byte_t* obi_str_to_obibytes(char* value)
 {
 	byte_t* value_b;
 	int32_t length;
-	uint8_t size;

-	size = 8;
+	// Compute the number of bytes on which the value will be encoded
 	length = strlen(value) + 1;		// +1 to store \0 at the end (makes retrieving faster)
+
+	// Allocate the memory for the encoded value
 	value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length);
 	if (value_b == NULL)
 	{
@ -1102,11 +1101,16 @@ byte_t* obi_str_to_obibytes(char* value)
 		return NULL;
 	}

-	*(value_b) = size;
+	// Store the number of bits on which each element is encoded
+	*(value_b) = 8;

-	*((int32_t*)(value_b+1)) = length;	// TODO comment
+	// Store the length (in bytes) of the encoded value (same as decoded for character strings)
+	*((int32_t*)(value_b+1)) = length;
+
+	// Store the initial length (in bytes) of the decoded value (same as encoded for character strings)
 	*((int32_t*)(value_b+5)) = length;

+	// Store the character string
 	strcpy(value_b+BYTE_ARRAY_HEADER_SIZE, value);

 	return value_b;
@ -1132,13 +1136,12 @@ byte_t* obi_seq_to_obibytes(char* seq)
 	byte_t*  encoded_seq;

 	// Check if just ATGC and set size of a nucleotide accordingly (2 bits or 4 bits)
-	//fprintf(stderr, "\nonly ATGC = %d", only_ATGC(seq));
 	if (only_ATGC(seq))
 		size = 2;
 	else
 		size = 4;

-	// Set length
+	// Compute the length (in bytes) of the encoded sequence
 	seq_length = strlen(seq);
 	if (size == 2)
 		length = ceil((double) seq_length / (double) 4.0);
@ -1149,23 +1152,35 @@ byte_t* obi_seq_to_obibytes(char* seq)
 	if (size == 2)
 		encoded_seq = encode_seq_on_2_bits(seq, seq_length);
 	else	// size == 4
+		encoded_seq = encode_seq_on_4_bits(seq, seq_length);
+	if (encoded_seq == NULL)
+	{
+		obi_set_errno(OBI_ARRAY_ERROR);
+		obidebug(1, "\nError encoding a DNA sequence");
 		return NULL;
-		//		encoded_seq = encode_seq_on_4_bits(seq, seq_length);
+	}

-	// Set the values in the byte array
+	// Allocate the memory for the encoded value
 	value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length);
+	if (value_b == NULL)
+	{
+		obi_set_errno(OBI_ARRAY_ERROR);
+		obidebug(1, "\nError allocating memory for a byte array");
+		return NULL;
+	}

+	// Store the number of bits on which each nucleotide is encoded
 	*(value_b) = size;
+
+	// Store the length (in bytes) of the encoded sequence
 	*((int32_t*)(value_b+1)) = length;
+
+	// Store the length (in bytes) of the initial sequence (necessary for decoding)
 	*((int32_t*)(value_b+5)) = seq_length;

-	//fprintf(stderr, "\nstored seq length : %d\n", *((int32_t*)(value_b+5)));
-
+	// Store the encoded sequence
 	memcpy(value_b+BYTE_ARRAY_HEADER_SIZE, encoded_seq, length);

-	//obidebug(1, "\n\nENCODED VALUE_B = ");
-	//printBits(((*((int32_t*)(value_b+1)))+BYTE_ARRAY_HEADER_SIZE), value_b);
-
 	free(encoded_seq);

 	return value_b;
@ -1177,17 +1192,21 @@ const char* obi_obibytes_to_seq(byte_t* value_b)
 	const char* value;
 	uint8_t size;	    // size of one element in bits

-	//obidebug(1, "\n\nGONNA DECODE VALUE_B = ");
-	//printBits(((*((int32_t*)(value_b+1)))+BYTE_ARRAY_HEADER_SIZE), value_b);
-
+	// Check the encoding (each nucleotide on 2 bits or 4 bits)
 	size = *(value_b);

 	// Decode
 	if (size == 2)
 		value = decode_seq_on_2_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5)));
 	else
+		value = decode_seq_on_4_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5)));
+
+	if (value == NULL)
+	{
+		obi_set_errno(OBI_ARRAY_ERROR);
+		obidebug(1, "\nError decoding a DNA sequence");
 		return NULL;
-//		value = decode_seq_on_4_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5)));
+	}

 	return value;
 }
--- a/src/obiarray.h
+++ b/src/obiarray.h
@ -306,6 +306,7 @@ byte_t* obi_seq_to_obibytes(char* seq);
 * @param value_b The byte array to convert.
 *
 * @returns A pointer to the DNA sequence contained in the byte array.
+ * @retval NULL if an error occurred.
 *
 * @since November 2015
 * @author Celine Mercier (celine.mercier@metabarcoding.org)