DNA sequences are now encoded on 4 bits when they are in IUPAC

This commit is contained in:
Celine Mercier
2015-11-20 15:32:09 +01:00
parent 87044b41d8
commit 6aa2f92930
5 changed files with 318 additions and 54 deletions

View File

@ -58,12 +58,9 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
byte_t* seq_b;
uint8_t modulo;
int32_t length_b;
int32_t i;
// fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>Encoding sequence %s", seq);
int32_t i;
length_b = ceil((double) length / (double) 4.0);
// fprintf(stderr, "\nLength: %d", length_b);
seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
@ -80,27 +77,19 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
{
case 'a':
case 'A':
seq_b[i/4] |= NUC_A;
// fprintf(stderr, "\nIn byte %d, writing A:", i/4);
// print_bits(seq_b, length_b);
seq_b[i/4] |= NUC_A_2b;
break;
case 'c':
case 'C':
seq_b[i/4] |= NUC_C;
// fprintf(stderr, "\nIn byte %d, writing C:", i/4);
// print_bits(seq_b, length_b);
seq_b[i/4] |= NUC_C_2b;
break;
case 'g':
case 'G':
seq_b[i/4] |= NUC_G;
// fprintf(stderr, "\nIn byte %d, writing G:", i/4);
// print_bits(seq_b, length_b);
seq_b[i/4] |= NUC_G_2b;
break;
case 't':
case 'T':
seq_b[i/4] |= NUC_T;
// fprintf(stderr, "\nIn byte %d, writing T:", i/4);
// print_bits(seq_b, length_b);
seq_b[i/4] |= NUC_T_2b;
break;
default:
obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])");
@ -113,9 +102,6 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
if (modulo)
seq_b[(i-1)/4] <<= (2*(4 - modulo));
// fprintf(stderr, "\n>>>>>>>>>Encoded:");
// print_bits(seq_b, length_b);
return seq_b;
}
@ -123,7 +109,7 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
{
char* seq;
int32_t i;
int32_t i;
uint8_t shift;
uint8_t mask;
uint8_t nuc;
@ -133,21 +119,21 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
for (i=0; i<length_seq; i++)
{
shift = 6 - 2*(i % 4);
mask = NUC_MASK << shift;
mask = NUC_MASK_2B << shift;
nuc = (seq_b[i/4] & mask) >> shift;
switch (nuc)
{
case NUC_A:
case NUC_A_2b:
seq[i] = 'a';
break;
case NUC_C:
case NUC_C_2b:
seq[i] = 'c';
break;
case NUC_G:
case NUC_G_2b:
seq[i] = 'g';
break;
case NUC_T:
case NUC_T_2b:
seq[i] = 't';
break;
default:
@ -162,6 +148,178 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
}
byte_t* encode_seq_on_4_bits(char* seq, int32_t length)
{
byte_t* seq_b;
uint8_t modulo;
int32_t length_b;
int32_t i;
length_b = ceil((double) length / (double) 2.0);
seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
// Initialize all the bits to 0
memset(seq_b, 0, length_b);
for (i=0; i<length; i++)
{
// Shift of 4 to make place for new nucleotide
seq_b[i/2] <<= 4;
// Add new nucleotide
switch (seq[i])
{
case 'a':
case 'A':
seq_b[i/2] |= NUC_A_4b;
break;
case 'c':
case 'C':
seq_b[i/2] |= NUC_C_4b;
break;
case 'g':
case 'G':
seq_b[i/2] |= NUC_G_4b;
break;
case 't':
case 'T':
seq_b[i/2] |= NUC_T_4b;
break;
case 'r':
case 'R':
seq_b[i/2] |= NUC_R_4b;
break;
case 'y':
case 'Y':
seq_b[i/2] |= NUC_Y_4b;
break;
case 's':
case 'S':
seq_b[i/2] |= NUC_S_4b;
break;
case 'w':
case 'W':
seq_b[i/2] |= NUC_W_4b;
break;
case 'k':
case 'K':
seq_b[i/2] |= NUC_K_4b;
break;
case 'm':
case 'M':
seq_b[i/2] |= NUC_M_4b;
break;
case 'b':
case 'B':
seq_b[i/2] |= NUC_B_4b;
break;
case 'd':
case 'D':
seq_b[i/2] |= NUC_D_4b;
break;
case 'h':
case 'H':
seq_b[i/2] |= NUC_H_4b;
break;
case 'v':
case 'V':
seq_b[i/2] |= NUC_V_4b;
break;
case 'n':
case 'N':
seq_b[i/2] |= NUC_N_4b;
break;
default:
obidebug(1, "\nInvalid nucleotide base when encoding (not IUPAC)");
return NULL;
}
}
// Final shift for the last byte if needed
modulo = (length % 2);
if (modulo)
seq_b[(i-1)/2] <<= (4*modulo);
return seq_b;
}
char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
{
char* seq;
int32_t i;
uint8_t shift;
uint8_t mask;
uint8_t nuc;
seq = (char*) malloc((length_seq+1) * sizeof(char));
for (i=0; i<length_seq; i++)
{
shift = 4 - 4*(i % 2);
mask = NUC_MASK_4B << shift;
nuc = (seq_b[i/2] & mask) >> shift;
switch (nuc)
{
case NUC_A_4b:
seq[i] = 'a';
break;
case NUC_C_4b:
seq[i] = 'c';
break;
case NUC_G_4b:
seq[i] = 'g';
break;
case NUC_T_4b:
seq[i] = 't';
break;
case NUC_R_4b:
seq[i] = 'r';
break;
case NUC_Y_4b:
seq[i] = 'y';
break;
case NUC_S_4b:
seq[i] = 's';
break;
case NUC_W_4b:
seq[i] = 'w';
break;
case NUC_K_4b:
seq[i] = 'k';
break;
case NUC_M_4b:
seq[i] = 'm';
break;
case NUC_B_4b:
seq[i] = 'b';
break;
case NUC_D_4b:
seq[i] = 'd';
break;
case NUC_H_4b:
seq[i] = 'h';
break;
case NUC_V_4b:
seq[i] = 'v';
break;
case NUC_N_4b:
seq[i] = 'n';
break;
default:
obidebug(1, "\nInvalid nucleotide base when decoding");
return NULL;
}
}
seq[length_seq] = '\0';
return seq;
}
////////// FOR DEBUGGING ///////////
// little endian