diff --git a/python/obitools3/unit_tests.py b/python/obitools3/unit_tests.py index 93112d2..850e06f 100644 --- a/python/obitools3/unit_tests.py +++ b/python/obitools3/unit_tests.py @@ -65,7 +65,7 @@ def random_obivalue(data_type): return randoms elif data_type == "OBI_SEQ" : length = randint(1,200) - randoms = ''.join(choice("atgc") for i in range(length)) + randoms = ''.join(choice("atgcryswkmdbhvn") for i in range(length)) return randoms class OBIDMS_Column_TestCase(unittest.TestCase): diff --git a/src/encode.c b/src/encode.c index 80b1350..2f9362b 100644 --- a/src/encode.c +++ b/src/encode.c @@ -58,12 +58,9 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length) byte_t* seq_b; uint8_t modulo; int32_t length_b; - int32_t i; - -// fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>Encoding sequence %s", seq); + int32_t i; length_b = ceil((double) length / (double) 4.0); -// fprintf(stderr, "\nLength: %d", length_b); seq_b = (byte_t*) malloc(length_b * sizeof(byte_t)); @@ -80,27 +77,19 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length) { case 'a': case 'A': - seq_b[i/4] |= NUC_A; -// fprintf(stderr, "\nIn byte %d, writing A:", i/4); -// print_bits(seq_b, length_b); + seq_b[i/4] |= NUC_A_2b; break; case 'c': case 'C': - seq_b[i/4] |= NUC_C; -// fprintf(stderr, "\nIn byte %d, writing C:", i/4); -// print_bits(seq_b, length_b); + seq_b[i/4] |= NUC_C_2b; break; case 'g': case 'G': - seq_b[i/4] |= NUC_G; -// fprintf(stderr, "\nIn byte %d, writing G:", i/4); -// print_bits(seq_b, length_b); + seq_b[i/4] |= NUC_G_2b; break; case 't': case 'T': - seq_b[i/4] |= NUC_T; -// fprintf(stderr, "\nIn byte %d, writing T:", i/4); -// print_bits(seq_b, length_b); + seq_b[i/4] |= NUC_T_2b; break; default: obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])"); @@ -113,9 +102,6 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length) if (modulo) seq_b[(i-1)/4] <<= (2*(4 - modulo)); -// fprintf(stderr, "\n>>>>>>>>>Encoded:"); -// print_bits(seq_b, length_b); - return seq_b; } @@ -123,7 +109,7 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length) char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq) { char* seq; - int32_t i; + int32_t i; uint8_t shift; uint8_t mask; uint8_t nuc; @@ -133,21 +119,21 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq) for (i=0; i> shift; switch (nuc) { - case NUC_A: + case NUC_A_2b: seq[i] = 'a'; break; - case NUC_C: + case NUC_C_2b: seq[i] = 'c'; break; - case NUC_G: + case NUC_G_2b: seq[i] = 'g'; break; - case NUC_T: + case NUC_T_2b: seq[i] = 't'; break; default: @@ -162,6 +148,178 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq) } +byte_t* encode_seq_on_4_bits(char* seq, int32_t length) +{ + byte_t* seq_b; + uint8_t modulo; + int32_t length_b; + int32_t i; + + length_b = ceil((double) length / (double) 2.0); + + seq_b = (byte_t*) malloc(length_b * sizeof(byte_t)); + + // Initialize all the bits to 0 + memset(seq_b, 0, length_b); + + for (i=0; i> shift; + + switch (nuc) + { + case NUC_A_4b: + seq[i] = 'a'; + break; + case NUC_C_4b: + seq[i] = 'c'; + break; + case NUC_G_4b: + seq[i] = 'g'; + break; + case NUC_T_4b: + seq[i] = 't'; + break; + case NUC_R_4b: + seq[i] = 'r'; + break; + case NUC_Y_4b: + seq[i] = 'y'; + break; + case NUC_S_4b: + seq[i] = 's'; + break; + case NUC_W_4b: + seq[i] = 'w'; + break; + case NUC_K_4b: + seq[i] = 'k'; + break; + case NUC_M_4b: + seq[i] = 'm'; + break; + case NUC_B_4b: + seq[i] = 'b'; + break; + case NUC_D_4b: + seq[i] = 'd'; + break; + case NUC_H_4b: + seq[i] = 'h'; + break; + case NUC_V_4b: + seq[i] = 'v'; + break; + case NUC_N_4b: + seq[i] = 'n'; + break; + default: + obidebug(1, "\nInvalid nucleotide base when decoding"); + return NULL; + } + } + + seq[length_seq] = '\0'; + + return seq; +} + + ////////// FOR DEBUGGING /////////// // little endian diff --git a/src/encode.h b/src/encode.h index 5a1bac8..9bb7954 100644 --- a/src/encode.h +++ b/src/encode.h @@ -18,7 +18,8 @@ #include "obiarray.h" -#define NUC_MASK 0x3 /**< Binary: 11 to use when decoding */ +#define NUC_MASK_2B 0x3 /**< Binary: 11 to use when decoding 2 bits sequences */ +#define NUC_MASK_4B 0xF /**< Binary: 1111 to use when decoding 4 bits sequences */ /** @@ -26,10 +27,33 @@ */ enum { - NUC_A = 0x0, /* binary: 00 */ - NUC_C = 0x1, /* binary: 01 */ - NUC_G = 0x2, /* binary: 10 */ - NUC_T = 0x3, /* binary: 11 */ + NUC_A_2b = 0x0, /* binary: 00 */ + NUC_C_2b = 0x1, /* binary: 01 */ + NUC_G_2b = 0x2, /* binary: 10 */ + NUC_T_2b = 0x3, /* binary: 11 */ +}; + + +/** + * @brief enum for the 4-bits codes for each of the 15 IUPAC nucleotides. + */ +enum +{ + NUC_A_4b = 0x1, /* binary: 0001 */ + NUC_C_4b = 0x2, /* binary: 0010 */ + NUC_G_4b = 0x3, /* binary: 0011 */ + NUC_T_4b = 0x4, /* binary: 0100 */ + NUC_R_4b = 0x5, /* binary: 0101 */ + NUC_Y_4b = 0x6, /* binary: 0110 */ + NUC_S_4b = 0x7, /* binary: 0111 */ + NUC_W_4b = 0x8, /* binary: 1000 */ + NUC_K_4b = 0x9, /* binary: 1001 */ + NUC_M_4b = 0xA, /* binary: 1010 */ + NUC_B_4b = 0xB, /* binary: 1011 */ + NUC_D_4b = 0xC, /* binary: 1100 */ + NUC_H_4b = 0xD, /* binary: 1101 */ + NUC_V_4b = 0xE, /* binary: 1110 */ + NUC_N_4b = 0xF, /* binary: 1111 */ }; @@ -72,10 +96,10 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length); /** * @brief Decodes a DNA sequence that is coded with each nucleotide on 2 bits. * - * A or a : 00 - * C or c : 01 - * T or t : 10 - * G or g : 11 + * 00 -> a + * 01 -> c + * 10 -> t + * 11 -> g * * @param seq The sequence to decode. * @param length_seq The initial length of the sequence before it was encoded. @@ -88,6 +112,68 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length); char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq); +/** + * @brief Encodes a DNA sequence with each nucleotide coded on 4 bits. + * + * A or a : 0001 + * C or c : 0010 + * G or g : 0011 + * T or t : 0100 + * R or r : 0101 + * Y or y : 0110 + * S or s : 0111 + * W or w : 1000 + * K or k : 1001 + * M or m : 1010 + * B or b : 1011 + * D or d : 1100 + * H or h : 1101 + * V or v : 1110 + * N or n : 1111 + * + * @warning The DNA sequence must contain only IUPAC characters. + * + * @param seq The sequence to encode. + * @param length The length of the sequence to encode. + * + * @returns The encoded sequence. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +byte_t* encode_seq_on_4_bits(char* seq, int32_t length); + + +/** + * @brief Decodes a DNA sequence that is coded with each nucleotide on 4 bits. + * + * A or a : 0001 + * C or c : 0010 + * G or g : 0011 + * T or t : 0100 + * R or r : 0101 + * Y or y : 0110 + * S or s : 0111 + * W or w : 1000 + * K or k : 1001 + * M or m : 1010 + * B or b : 1011 + * D or d : 1100 + * H or h : 1101 + * V or v : 1110 + * N or n : 1111 + * + * @param seq The sequence to decode. + * @param length_seq The initial length of the sequence before it was encoded. + * + * @returns The decoded sequence ended with '\0'. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq); + + ////////// FOR DEBUGGING /////////// // little endian diff --git a/src/obiarray.c b/src/obiarray.c index a4882c6..8e04003 100644 --- a/src/obiarray.c +++ b/src/obiarray.c @@ -1007,8 +1007,6 @@ index_t obi_array_add(OBIDMS_array_p array, byte_t* value) (array->first)[idx] = data_size_used; // Store the value itself at the end of the data -// fprintf(stderr, "\nMEMCOPYING TO STORE, with size %ld :", value_size); -// printBits(value_size, value); memcpy((((array->data)->data)+data_size_used), value, value_size); // Update the data size @@ -1090,10 +1088,11 @@ byte_t* obi_str_to_obibytes(char* value) { byte_t* value_b; int32_t length; - uint8_t size; - size = 8; + // Compute the number of bytes on which the value will be encoded length = strlen(value) + 1; // +1 to store \0 at the end (makes retrieving faster) + + // Allocate the memory for the encoded value value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length); if (value_b == NULL) { @@ -1102,11 +1101,16 @@ byte_t* obi_str_to_obibytes(char* value) return NULL; } - *(value_b) = size; + // Store the number of bits on which each element is encoded + *(value_b) = 8; - *((int32_t*)(value_b+1)) = length; // TODO comment + // Store the length (in bytes) of the encoded value (same as decoded for character strings) + *((int32_t*)(value_b+1)) = length; + + // Store the initial length (in bytes) of the decoded value (same as encoded for character strings) *((int32_t*)(value_b+5)) = length; + // Store the character string strcpy(value_b+BYTE_ARRAY_HEADER_SIZE, value); return value_b; @@ -1132,13 +1136,12 @@ byte_t* obi_seq_to_obibytes(char* seq) byte_t* encoded_seq; // Check if just ATGC and set size of a nucleotide accordingly (2 bits or 4 bits) - //fprintf(stderr, "\nonly ATGC = %d", only_ATGC(seq)); if (only_ATGC(seq)) size = 2; else size = 4; - // Set length + // Compute the length (in bytes) of the encoded sequence seq_length = strlen(seq); if (size == 2) length = ceil((double) seq_length / (double) 4.0); @@ -1149,23 +1152,35 @@ byte_t* obi_seq_to_obibytes(char* seq) if (size == 2) encoded_seq = encode_seq_on_2_bits(seq, seq_length); else // size == 4 + encoded_seq = encode_seq_on_4_bits(seq, seq_length); + if (encoded_seq == NULL) + { + obi_set_errno(OBI_ARRAY_ERROR); + obidebug(1, "\nError encoding a DNA sequence"); return NULL; - // encoded_seq = encode_seq_on_4_bits(seq, seq_length); + } - // Set the values in the byte array + // Allocate the memory for the encoded value value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length); + if (value_b == NULL) + { + obi_set_errno(OBI_ARRAY_ERROR); + obidebug(1, "\nError allocating memory for a byte array"); + return NULL; + } + // Store the number of bits on which each nucleotide is encoded *(value_b) = size; + + // Store the length (in bytes) of the encoded sequence *((int32_t*)(value_b+1)) = length; + + // Store the length (in bytes) of the initial sequence (necessary for decoding) *((int32_t*)(value_b+5)) = seq_length; - //fprintf(stderr, "\nstored seq length : %d\n", *((int32_t*)(value_b+5))); - + // Store the encoded sequence memcpy(value_b+BYTE_ARRAY_HEADER_SIZE, encoded_seq, length); - //obidebug(1, "\n\nENCODED VALUE_B = "); - //printBits(((*((int32_t*)(value_b+1)))+BYTE_ARRAY_HEADER_SIZE), value_b); - free(encoded_seq); return value_b; @@ -1177,17 +1192,21 @@ const char* obi_obibytes_to_seq(byte_t* value_b) const char* value; uint8_t size; // size of one element in bits - //obidebug(1, "\n\nGONNA DECODE VALUE_B = "); - //printBits(((*((int32_t*)(value_b+1)))+BYTE_ARRAY_HEADER_SIZE), value_b); - + // Check the encoding (each nucleotide on 2 bits or 4 bits) size = *(value_b); // Decode if (size == 2) value = decode_seq_on_2_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5))); else + value = decode_seq_on_4_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5))); + + if (value == NULL) + { + obi_set_errno(OBI_ARRAY_ERROR); + obidebug(1, "\nError decoding a DNA sequence"); return NULL; -// value = decode_seq_on_4_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5))); + } return value; } diff --git a/src/obiarray.h b/src/obiarray.h index cfcaf12..eaf1486 100644 --- a/src/obiarray.h +++ b/src/obiarray.h @@ -306,6 +306,7 @@ byte_t* obi_seq_to_obibytes(char* seq); * @param value_b The byte array to convert. * * @returns A pointer to the DNA sequence contained in the byte array. + * @retval NULL if an error occurred. * * @since November 2015 * @author Celine Mercier (celine.mercier@metabarcoding.org)