2015-11-19 18:12:48 +01:00
|
|
|
/****************************************************************************
|
|
|
|
* Encoding functions *
|
|
|
|
****************************************************************************/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @file encode.c
|
|
|
|
* @author Celine Mercier
|
|
|
|
* @date November 18th 2015
|
|
|
|
* @brief Functions encoding DNA sequences.
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <math.h>
|
|
|
|
|
|
|
|
#include "encode.h"
|
2015-12-11 17:24:44 +01:00
|
|
|
#include "obierrno.h"
|
|
|
|
#include "obitypes.h" // For byte_t type
|
2015-11-19 18:12:48 +01:00
|
|
|
#include "obidebug.h"
|
|
|
|
|
|
|
|
|
|
|
|
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
|
|
|
|
|
|
|
|
|
|
|
// TODO: endianness problem?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool only_ATGC(char* seq)
|
|
|
|
{
|
|
|
|
char* c = seq;
|
|
|
|
|
|
|
|
while (*c)
|
|
|
|
{
|
|
|
|
if (!((*c == 'A') || \
|
|
|
|
(*c == 'T') || \
|
|
|
|
(*c == 'G') || \
|
|
|
|
(*c == 'C') || \
|
|
|
|
(*c == 'a') || \
|
|
|
|
(*c == 't') || \
|
|
|
|
(*c == 'g') || \
|
|
|
|
(*c == 'c')))
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
c++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-11-20 11:32:47 +01:00
|
|
|
byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
|
2015-11-19 18:12:48 +01:00
|
|
|
{
|
|
|
|
byte_t* seq_b;
|
2015-11-20 11:32:47 +01:00
|
|
|
uint8_t modulo;
|
2015-11-19 18:12:48 +01:00
|
|
|
int32_t length_b;
|
2015-11-20 15:32:09 +01:00
|
|
|
int32_t i;
|
2015-11-19 18:12:48 +01:00
|
|
|
|
|
|
|
length_b = ceil((double) length / (double) 4.0);
|
|
|
|
|
|
|
|
seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
|
|
|
|
|
2015-11-20 11:32:47 +01:00
|
|
|
// Initialize all the bits to 0
|
2015-11-19 18:12:48 +01:00
|
|
|
memset(seq_b, 0, length_b);
|
|
|
|
|
|
|
|
for (i=0; i<length; i++)
|
|
|
|
{
|
2015-11-20 11:32:47 +01:00
|
|
|
// Shift of 2 to make place for new nucleotide
|
|
|
|
seq_b[i/4] <<= 2;
|
2015-11-19 18:12:48 +01:00
|
|
|
|
2015-11-20 11:32:47 +01:00
|
|
|
// Add new nucleotide
|
2015-11-19 18:12:48 +01:00
|
|
|
switch (seq[i])
|
|
|
|
{
|
|
|
|
case 'a':
|
|
|
|
case 'A':
|
2015-11-20 15:32:09 +01:00
|
|
|
seq_b[i/4] |= NUC_A_2b;
|
2015-11-19 18:12:48 +01:00
|
|
|
break;
|
|
|
|
case 'c':
|
|
|
|
case 'C':
|
2015-11-20 15:32:09 +01:00
|
|
|
seq_b[i/4] |= NUC_C_2b;
|
2015-11-19 18:12:48 +01:00
|
|
|
break;
|
|
|
|
case 'g':
|
|
|
|
case 'G':
|
2015-11-20 15:32:09 +01:00
|
|
|
seq_b[i/4] |= NUC_G_2b;
|
2015-11-19 18:12:48 +01:00
|
|
|
break;
|
|
|
|
case 't':
|
|
|
|
case 'T':
|
2015-11-20 15:32:09 +01:00
|
|
|
seq_b[i/4] |= NUC_T_2b;
|
2015-11-19 18:12:48 +01:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-11-20 11:32:47 +01:00
|
|
|
// Final shift for the last byte if needed
|
|
|
|
modulo = (length % 4);
|
|
|
|
if (modulo)
|
|
|
|
seq_b[(i-1)/4] <<= (2*(4 - modulo));
|
|
|
|
|
2015-11-19 18:12:48 +01:00
|
|
|
return seq_b;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
|
|
|
|
{
|
|
|
|
char* seq;
|
2015-11-20 15:32:09 +01:00
|
|
|
int32_t i;
|
2015-11-19 18:12:48 +01:00
|
|
|
uint8_t shift;
|
|
|
|
uint8_t mask;
|
|
|
|
uint8_t nuc;
|
|
|
|
|
|
|
|
seq = (char*) malloc((length_seq+1) * sizeof(char));
|
|
|
|
|
|
|
|
for (i=0; i<length_seq; i++)
|
|
|
|
{
|
|
|
|
shift = 6 - 2*(i % 4);
|
2015-11-20 15:32:09 +01:00
|
|
|
mask = NUC_MASK_2B << shift;
|
2015-11-19 18:12:48 +01:00
|
|
|
nuc = (seq_b[i/4] & mask) >> shift;
|
|
|
|
|
|
|
|
switch (nuc)
|
|
|
|
{
|
2015-11-20 15:32:09 +01:00
|
|
|
case NUC_A_2b:
|
2015-11-19 18:12:48 +01:00
|
|
|
seq[i] = 'a';
|
|
|
|
break;
|
2015-11-20 15:32:09 +01:00
|
|
|
case NUC_C_2b:
|
2015-11-19 18:12:48 +01:00
|
|
|
seq[i] = 'c';
|
|
|
|
break;
|
2015-11-20 15:32:09 +01:00
|
|
|
case NUC_G_2b:
|
2015-11-19 18:12:48 +01:00
|
|
|
seq[i] = 'g';
|
|
|
|
break;
|
2015-11-20 15:32:09 +01:00
|
|
|
case NUC_T_2b:
|
2015-11-19 18:12:48 +01:00
|
|
|
seq[i] = 't';
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
obidebug(1, "\nInvalid nucleotide base when decoding");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
seq[length_seq] = '\0';
|
|
|
|
|
|
|
|
return seq;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-11-20 15:32:09 +01:00
|
|
|
byte_t* encode_seq_on_4_bits(char* seq, int32_t length)
|
|
|
|
{
|
|
|
|
byte_t* seq_b;
|
|
|
|
uint8_t modulo;
|
|
|
|
int32_t length_b;
|
|
|
|
int32_t i;
|
|
|
|
|
|
|
|
length_b = ceil((double) length / (double) 2.0);
|
|
|
|
|
|
|
|
seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
|
|
|
|
|
|
|
|
// Initialize all the bits to 0
|
|
|
|
memset(seq_b, 0, length_b);
|
|
|
|
|
|
|
|
for (i=0; i<length; i++)
|
|
|
|
{
|
|
|
|
// Shift of 4 to make place for new nucleotide
|
|
|
|
seq_b[i/2] <<= 4;
|
|
|
|
|
|
|
|
// Add new nucleotide
|
|
|
|
switch (seq[i])
|
|
|
|
{
|
|
|
|
case 'a':
|
|
|
|
case 'A':
|
|
|
|
seq_b[i/2] |= NUC_A_4b;
|
|
|
|
break;
|
|
|
|
case 'c':
|
|
|
|
case 'C':
|
|
|
|
seq_b[i/2] |= NUC_C_4b;
|
|
|
|
break;
|
|
|
|
case 'g':
|
|
|
|
case 'G':
|
|
|
|
seq_b[i/2] |= NUC_G_4b;
|
|
|
|
break;
|
|
|
|
case 't':
|
|
|
|
case 'T':
|
|
|
|
seq_b[i/2] |= NUC_T_4b;
|
|
|
|
break;
|
|
|
|
case 'r':
|
|
|
|
case 'R':
|
|
|
|
seq_b[i/2] |= NUC_R_4b;
|
|
|
|
break;
|
|
|
|
case 'y':
|
|
|
|
case 'Y':
|
|
|
|
seq_b[i/2] |= NUC_Y_4b;
|
|
|
|
break;
|
|
|
|
case 's':
|
|
|
|
case 'S':
|
|
|
|
seq_b[i/2] |= NUC_S_4b;
|
|
|
|
break;
|
|
|
|
case 'w':
|
|
|
|
case 'W':
|
|
|
|
seq_b[i/2] |= NUC_W_4b;
|
|
|
|
break;
|
|
|
|
case 'k':
|
|
|
|
case 'K':
|
|
|
|
seq_b[i/2] |= NUC_K_4b;
|
|
|
|
break;
|
|
|
|
case 'm':
|
|
|
|
case 'M':
|
|
|
|
seq_b[i/2] |= NUC_M_4b;
|
|
|
|
break;
|
|
|
|
case 'b':
|
|
|
|
case 'B':
|
|
|
|
seq_b[i/2] |= NUC_B_4b;
|
|
|
|
break;
|
|
|
|
case 'd':
|
|
|
|
case 'D':
|
|
|
|
seq_b[i/2] |= NUC_D_4b;
|
|
|
|
break;
|
|
|
|
case 'h':
|
|
|
|
case 'H':
|
|
|
|
seq_b[i/2] |= NUC_H_4b;
|
|
|
|
break;
|
|
|
|
case 'v':
|
|
|
|
case 'V':
|
|
|
|
seq_b[i/2] |= NUC_V_4b;
|
|
|
|
break;
|
|
|
|
case 'n':
|
|
|
|
case 'N':
|
|
|
|
seq_b[i/2] |= NUC_N_4b;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
obidebug(1, "\nInvalid nucleotide base when encoding (not IUPAC)");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Final shift for the last byte if needed
|
|
|
|
modulo = (length % 2);
|
|
|
|
if (modulo)
|
|
|
|
seq_b[(i-1)/2] <<= (4*modulo);
|
|
|
|
|
|
|
|
return seq_b;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
|
|
|
|
{
|
|
|
|
char* seq;
|
|
|
|
int32_t i;
|
|
|
|
uint8_t shift;
|
|
|
|
uint8_t mask;
|
|
|
|
uint8_t nuc;
|
|
|
|
|
|
|
|
seq = (char*) malloc((length_seq+1) * sizeof(char));
|
|
|
|
|
|
|
|
for (i=0; i<length_seq; i++)
|
|
|
|
{
|
|
|
|
shift = 4 - 4*(i % 2);
|
|
|
|
mask = NUC_MASK_4B << shift;
|
|
|
|
nuc = (seq_b[i/2] & mask) >> shift;
|
|
|
|
|
|
|
|
switch (nuc)
|
|
|
|
{
|
|
|
|
case NUC_A_4b:
|
|
|
|
seq[i] = 'a';
|
|
|
|
break;
|
|
|
|
case NUC_C_4b:
|
|
|
|
seq[i] = 'c';
|
|
|
|
break;
|
|
|
|
case NUC_G_4b:
|
|
|
|
seq[i] = 'g';
|
|
|
|
break;
|
|
|
|
case NUC_T_4b:
|
|
|
|
seq[i] = 't';
|
|
|
|
break;
|
|
|
|
case NUC_R_4b:
|
|
|
|
seq[i] = 'r';
|
|
|
|
break;
|
|
|
|
case NUC_Y_4b:
|
|
|
|
seq[i] = 'y';
|
|
|
|
break;
|
|
|
|
case NUC_S_4b:
|
|
|
|
seq[i] = 's';
|
|
|
|
break;
|
|
|
|
case NUC_W_4b:
|
|
|
|
seq[i] = 'w';
|
|
|
|
break;
|
|
|
|
case NUC_K_4b:
|
|
|
|
seq[i] = 'k';
|
|
|
|
break;
|
|
|
|
case NUC_M_4b:
|
|
|
|
seq[i] = 'm';
|
|
|
|
break;
|
|
|
|
case NUC_B_4b:
|
|
|
|
seq[i] = 'b';
|
|
|
|
break;
|
|
|
|
case NUC_D_4b:
|
|
|
|
seq[i] = 'd';
|
|
|
|
break;
|
|
|
|
case NUC_H_4b:
|
|
|
|
seq[i] = 'h';
|
|
|
|
break;
|
|
|
|
case NUC_V_4b:
|
|
|
|
seq[i] = 'v';
|
|
|
|
break;
|
|
|
|
case NUC_N_4b:
|
|
|
|
seq[i] = 'n';
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
obidebug(1, "\nInvalid nucleotide base when decoding");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
seq[length_seq] = '\0';
|
|
|
|
|
|
|
|
return seq;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-11-29 11:57:07 +01:00
|
|
|
///////////////////// FOR DEBUGGING ///////////////////////////
|
|
|
|
//NOTE: The first byte is printed the first (at the left-most).
|
2015-11-19 18:12:48 +01:00
|
|
|
|
|
|
|
void print_bits(void* ptr, int32_t size)
|
|
|
|
{
|
|
|
|
uint8_t* b = (uint8_t*) ptr;
|
|
|
|
uint8_t byte;
|
|
|
|
int32_t i, j;
|
|
|
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
for (i=0;i<size;i++)
|
|
|
|
{
|
|
|
|
for (j=7;j>=0;j--)
|
|
|
|
{
|
|
|
|
byte = b[i] & (1<<j);
|
|
|
|
byte >>= j;
|
|
|
|
fprintf(stderr, "%u", byte);
|
|
|
|
}
|
|
|
|
fprintf(stderr, " ");
|
|
|
|
}
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
}
|