New column type for DNA sequences. Only for those coded on 2 bits (only

'ATGCatgc') for now.
This commit is contained in:
Celine Mercier
2015-11-19 18:12:48 +01:00
parent e371248567
commit 6ab1c83302
17 changed files with 860 additions and 29 deletions

View File

@ -14,3 +14,5 @@
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/encode.h
../../../src/encode.c

View File

@ -47,6 +47,11 @@ from ._obidmscolumn_str cimport OBIDMS_column_str, \
OBIDMS_column_str_multi_elts, \
OBIDMS_column_str_multi_elts_writable
from ._obidmscolumn_seq cimport OBIDMS_column_seq, \
OBIDMS_column_seq_writable, \
OBIDMS_column_seq_multi_elts, \
OBIDMS_column_seq_multi_elts_writable
cdef class OBIDMS :
@ -215,6 +220,17 @@ cdef class OBIDMS :
subclass = OBIDMS_column_str
else :
subclass = OBIDMS_column_str_multi_elts
elif data_type == 6 :
if (create or clone) :
if nb_elements_per_line == 1 :
subclass = OBIDMS_column_seq_writable
else :
subclass = OBIDMS_column_seq_multi_elts_writable
else :
if nb_elements_per_line == 1 :
subclass = OBIDMS_column_seq
else :
subclass = OBIDMS_column_seq_multi_elts
else :
raise Exception("Problem with the data type")
@ -238,7 +254,7 @@ cdef class OBIDMS_column :
bint create,
bint clone, bint clone_data,
obiversion_t version_number,
OBIType_t type,
OBIType_t type, # There's a problem with this with the OBI_IDX columns as there are 2 subtypes
index_t nb_lines,
index_t nb_elements_per_line,
list elements_names,

View File

@ -0,0 +1,18 @@
../../../src/obidmscolumn_seq.c
../../../src/obidmscolumn_seq.h
../../../src/obidmscolumn.h
../../../src/obidmscolumn.c
../../../src/obidmscolumndir.h
../../../src/obidmscolumndir.c
../../../src/obidms.h
../../../src/obidms.c
../../../src/obierrno.h
../../../src/obierrno.c
../../../src/obilittlebigman.h
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c

View File

@ -0,0 +1,25 @@
#cython: language_level=3
from .capi.obitypes cimport index_t
from ._obidms cimport OBIDMS_column
# Declaration of the read-only column type holding one DNA sequence per line.
cdef class OBIDMS_column_seq(OBIDMS_column):
# Returns the sequence stored at line `line_nb`.
cpdef object get_line(self, index_t line_nb)
# Always raises in this read-only class; overridden by the writable subclass.
cpdef set_line(self, index_t line_nb, object value)
# Closes the underlying C column.
cpdef close(self)
# Declaration of the writable variant of OBIDMS_column_seq (one sequence per line).
cdef class OBIDMS_column_seq_writable(OBIDMS_column_seq):
# Stores the sequence `value` at line `line_nb`.
cpdef set_line(self, index_t line_nb, object value)
# Closes the column through the truncate-and-close C function.
cpdef close(self)
# Declaration of the read-only column type holding several named DNA sequences per line.
cdef class OBIDMS_column_seq_multi_elts(OBIDMS_column_seq):
# Returns the sequence stored under `element_name` at line `line_nb`.
cpdef object get_item(self, index_t line_nb, str element_name)
# Returns the elements of line `line_nb` as a dict keyed by element name.
cpdef object get_line(self, index_t line_nb)
# Always raises in this read-only class; overridden by the writable subclass.
cpdef set_item(self, index_t line_nb, str element_name, str value)
# Always raises in this read-only class; overridden by the writable subclass.
cpdef set_line(self, index_t line_nb, object values)
# Declaration of the writable variant of OBIDMS_column_seq_multi_elts.
cdef class OBIDMS_column_seq_multi_elts_writable(OBIDMS_column_seq_multi_elts):
# Stores the sequence `value` under `element_name` at line `line_nb`.
cpdef set_item(self, index_t line_nb, str element_name, str value)
# Stores every (element name -> sequence) pair of `values` at line `line_nb`.
cpdef set_line(self, index_t line_nb, object values)
# Closes the column through the truncate-and-close C function.
cpdef close(self)

View File

@ -0,0 +1,103 @@
#cython: language_level=3
from .capi.obidmscolumn cimport obi_close_column,\
obi_truncate_and_close_column, \
obi_column_get_obiseq_with_elt_name, \
obi_column_get_obiseq_with_elt_idx, \
obi_column_set_obiseq_with_elt_name, \
obi_column_set_obiseq_with_elt_idx
from .capi.obierrno cimport obi_errno
from .capi.obitypes cimport OBIIdx_NA, const_char_p
from obitools3.utils cimport str2bytes, bytes2str
cdef class OBIDMS_column_seq(OBIDMS_column):
    """Read-only column storing one DNA sequence per line."""

    cpdef object get_line(self, index_t line_nb):
        """Return the sequence stored at line ``line_nb``.

        Returns the sequence as a ``str``, or ``None`` when the C layer
        reports the NA value. Raises ``IndexError`` when ``obi_errno`` is
        set after the C call.
        """
        cdef bytes value
        value = <bytes> obi_column_get_obiseq_with_elt_idx(self.pointer, line_nb, 0)
        if obi_errno > 0 :
            raise IndexError(line_nb)
        # The C getter returns an empty C string for NA, which arrives here
        # as b"". Comparing the bytes object with the integer OBIIdx_NA can
        # never be true, so emptiness is tested instead.
        # NOTE(review): an empty stored sequence is indistinguishable from NA
        # at this level.
        # NOTE(review): non-NA values come from a malloc'ed C buffer that is
        # never freed here; it cannot be freed blindly because the NA value
        # is a static literal.
        if not value :
            return None
        return bytes2str(value)

    cpdef set_line(self, index_t line_nb, object value):
        """Reject any write: this column type is read-only."""
        raise Exception("Column is read-only")

    cpdef close(self):
        """Close the underlying C column; raise if the C call fails."""
        if obi_close_column(self.pointer) < 0 :
            raise Exception("Problem closing a column")
cdef class OBIDMS_column_seq_writable(OBIDMS_column_seq):
    """Writable column storing one DNA sequence per line."""

    cpdef set_line(self, index_t line_nb, object value):
        """Store ``value`` at line ``line_nb``; raise if the C setter fails."""
        cdef int status
        status = obi_column_set_obiseq_with_elt_idx(self.pointer, line_nb, 0, str2bytes(value))
        if status < 0:
            raise Exception("Problem setting a value in a column")

    cpdef close(self):
        """Close the column through obi_truncate_and_close_column(); raise on failure."""
        cdef int status
        status = obi_truncate_and_close_column(self.pointer)
        if status < 0 :
            raise Exception("Problem closing a column")
cdef class OBIDMS_column_seq_multi_elts(OBIDMS_column_seq):
    """Read-only column storing several named DNA sequences per line."""

    cpdef object get_item(self, index_t line_nb, str element_name):
        """Return the sequence stored under ``element_name`` at ``line_nb``.

        Returns a ``str``, or ``None`` when the C layer reports the NA value.
        Raises ``IndexError`` when ``obi_errno`` is set after the C call.
        """
        cdef bytes value
        value = <bytes> obi_column_get_obiseq_with_elt_name(self.pointer, line_nb, str2bytes(element_name))
        if obi_errno > 0 :
            raise IndexError(line_nb, element_name)
        # The C getter returns an empty C string for NA (b"" here); the bytes
        # object can never equal the integer OBIIdx_NA, so test emptiness.
        if not value :
            return None
        return bytes2str(value)

    cpdef object get_line(self, index_t line_nb) :
        """Return the elements of line ``line_nb`` as a dict keyed by element name.

        Returns ``None`` when every element of the line is NA. Raises
        ``IndexError`` when ``obi_errno`` is set after a C call.
        """
        cdef bytes value
        cdef object result
        cdef index_t i
        cdef bint all_NA
        result = {}
        all_NA = True
        for i in range(self.nb_elements_per_line) :
            value = <bytes> obi_column_get_obiseq_with_elt_idx(self.pointer, line_nb, i)
            if obi_errno > 0 :
                raise IndexError(line_nb)
            result[self.elements_names[i]] = bytes2str(value)
            # NA comes back from C as an empty string, so any non-empty value
            # means the line holds at least one real sequence.
            if value :
                all_NA = False
        if all_NA :
            result = None
        return result

    cpdef set_item(self, index_t line_nb, str element_name, str value):
        """Reject any write: this column type is read-only."""
        raise Exception("Column is read-only")

    cpdef set_line(self, index_t line_nb, object values):
        """Reject any write: this column type is read-only."""
        raise Exception("Column is read-only")
cdef class OBIDMS_column_seq_multi_elts_writable(OBIDMS_column_seq_multi_elts):
    """Writable column storing several named DNA sequences per line."""

    cpdef set_item(self, index_t line_nb, str element_name, str value):
        """Store ``value`` under ``element_name`` at ``line_nb``; raise if the C setter fails."""
        cdef int status
        status = obi_column_set_obiseq_with_elt_name(self.pointer, line_nb, str2bytes(element_name), str2bytes(value))
        if status < 0:
            raise Exception("Problem setting a value in a column")

    cpdef set_line(self, index_t line_nb, object values):
        """Store each (element name -> sequence) entry of ``values`` at ``line_nb``."""
        cdef str seq
        for name in values :
            seq = values[name]
            self.set_item(line_nb, name, seq)

    cpdef close(self):
        """Close the column through obi_truncate_and_close_column(); raise on failure."""
        cdef int status
        status = obi_truncate_and_close_column(self.pointer)
        if status < 0 :
            raise Exception("Problem closing a column")

View File

@ -163,10 +163,30 @@ cdef extern from "obidmscolumn_str.h" nogil:
char* value)
const_char_p obi_column_get_obistr_with_elt_name(OBIDMS_column_p column,
index_t line_nb,
const_char_p element_name)
index_t line_nb,
const_char_p element_name)
const_char_p obi_column_get_obistr_with_elt_idx(OBIDMS_column_p column,
index_t line_nb,
index_t element_idx)
index_t line_nb,
index_t element_idx)
cdef extern from "obidmscolumn_seq.h" nogil:
int obi_column_set_obiseq_with_elt_name(OBIDMS_column_p column,
index_t line_nb,
const_char_p element_name,
char* value)
int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column,
index_t line_nb,
index_t element_idx,
char* value)
const_char_p obi_column_get_obiseq_with_elt_name(OBIDMS_column_p column,
index_t line_nb,
const_char_p element_name)
const_char_p obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column,
index_t line_nb,
index_t element_idx)

View File

@ -10,9 +10,9 @@ from obitools3.obidms._obidms import OBIDMS
LINE_COUNT_FOR_TEST_COLUMN = 10000 # TODO randomize?
SMALLER_LINE_COUNT_FOR_TEST_COLUMN = 1000 # TODO randomize?
NB_ELEMENTS_PER_LINE = 20 # TODO randomize?
NB_ELEMENTS_PER_LINE = 10 # TODO randomize?
DMS_NAME = "unit_test_dms"
DATA_TYPES = ['OBI_INT', 'OBI_FLOAT', 'OBI_BOOL', 'OBI_CHAR', 'OBI_IDX']
DATA_TYPES = ['OBI_INT', 'OBI_FLOAT', 'OBI_BOOL', 'OBI_CHAR', 'OBI_STR', 'OBI_SEQ']
def create_test_obidms():
@ -58,12 +58,15 @@ def random_obivalue(data_type):
elif data_type == "OBI_BOOL" :
return randint(0,1)
elif data_type == "OBI_CHAR" :
nucs = 'atgc'
return nucs[randint(0,3)]
elif data_type == "OBI_IDX" :
length = randint(1,500)
return choice(string.ascii_lowercase)
elif data_type == "OBI_STR" :
length = randint(1,200)
randoms = ''.join(choice(string.ascii_lowercase) for i in range(length))
return randoms
elif data_type == "OBI_SEQ" :
length = randint(1,200)
randoms = ''.join(choice("atgc") for i in range(length))
return randoms
class OBIDMS_Column_TestCase(unittest.TestCase):
def tearDown(self):
@ -255,6 +258,30 @@ class OBIDMS_Column_OBI_STR_multiple_elements_TestCase(OBIDMS_Column_multiple_el
self.data_type_code,
multiple_elements_per_line=True)
class OBIDMS_Column_OBI_SEQ_TestCase(OBIDMS_Column_TestCase):
    """Run the generic column tests on a single-element OBI_SEQ column."""

    def setUp(self):
        """Create a fresh test DMS and an OBI_SEQ column (type code 6) in it."""
        self.data_type_code = 6

        dms, dms_name, dms_dir_name = create_test_obidms()
        self.dms = dms
        self.dms_name = dms_name
        self.dms_dir_name = dms_dir_name

        col, col_name, data_type_str = create_test_column(self.dms,
                                                          self.data_type_code)
        self.col = col
        self.col_name = col_name
        self.data_type_str = data_type_str
class OBIDMS_Column_OBI_SEQ_multiple_elements_TestCase(OBIDMS_Column_multiple_elements_TestCase):
    """Run the generic multi-element column tests on an OBI_SEQ column."""

    def setUp(self):
        """Create a fresh test DMS and a multi-element OBI_SEQ column (type code 6)."""
        self.data_type_code = 6

        dms, dms_name, dms_dir_name = create_test_obidms()
        self.dms = dms
        self.dms_name = dms_name
        self.dms_dir_name = dms_dir_name

        col, col_name, elts_names, data_type_str = create_test_column(
            self.dms,
            self.data_type_code,
            multiple_elements_per_line=True)
        self.col = col
        self.col_name = col_name
        self.elts_names = elts_names
        self.data_type_str = data_type_str
if __name__ == '__main__':
unittest.main(verbosity=2, defaultTest=["OBIDMS_Column_OBI_INT_TestCase",
@ -266,6 +293,8 @@ if __name__ == '__main__':
"OBIDMS_Column_OBI_CHAR_TestCase",
"OBIDMS_Column_OBI_CHAR_multiple_elements_TestCase",
"OBIDMS_Column_OBI_STR_TestCase",
"OBIDMS_Column_OBI_STR_multiple_elements_TestCase"])
"OBIDMS_Column_OBI_STR_multiple_elements_TestCase",
"OBIDMS_Column_OBI_SEQ_TestCase",
"OBIDMS_Column_OBI_SEQ_multiple_elements_TestCase"])

180
src/encode.c Normal file
View File

@ -0,0 +1,180 @@
/****************************************************************************
* Encoding functions *
****************************************************************************/
/**
* @file encode.c
* @author Celine Mercier
* @date November 18th 2015
* @brief Functions encoding DNA sequences.
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include "encode.h"
#include "obiarray.h"
#include "obidebug.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
// TODO: endianness problem?
bool only_ATGC(char* seq)
{
    char* cursor;

    // Walk the NUL-terminated string and stop at the first character
    // that is not one of 'ATGCatgc'.
    for (cursor = seq; *cursor != '\0'; cursor++)
    {
        switch (*cursor)
        {
            case 'A': case 'T': case 'G': case 'C':
            case 'a': case 't': case 'g': case 'c':
                break;
            default:
                return 0;
        }
    }

    // Reached the end (or the string was empty): only valid characters.
    return 1;
}
/*
 * Packs a DNA sequence into a byte array, 2 bits per nucleotide.
 *
 * Nucleotides are written from the most significant bits of each byte to
 * the least significant ones (4 nucleotides per byte); unused trailing bits
 * are left at 0.
 *
 * seq    : the sequence to encode, containing only 'atgcATGC' characters.
 * length : the number of nucleotides to encode.
 *
 * Returns a newly allocated buffer that the caller must free(), or NULL if
 * the length is negative, the allocation fails or an invalid character is met.
 */
byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
{
    byte_t*  seq_b;
    uint8_t  code;
    uint8_t  shift;
    int32_t  length_b;
    int32_t  i;

    if (length < 0)
    {
        obidebug(1, "\nInvalid (negative) sequence length when encoding");
        return NULL;
    }

    // Integer ceiling of length/4, written so that it cannot overflow
    length_b = (length / 4) + ((length % 4) != 0);

    // calloc() zero-fills the buffer so nucleotides can simply be OR-ed in.
    // At least 1 byte is requested so that an empty sequence still yields a valid pointer.
    seq_b = (byte_t*) calloc((size_t) ((length_b > 0) ? length_b : 1), sizeof(byte_t));
    if (seq_b == NULL)
    {
        obidebug(1, "\nError allocating memory for an encoded sequence");
        return NULL;
    }

    for (i=0; i<length; i++)
    {
        // The first nucleotide of each group of 4 goes in the 2 highest bits
        shift = 6 - 2*(i%4);

        switch (seq[i])
        {
            case 'a':
            case 'A':
                code = NUC_A;
                break;
            case 'c':
            case 'C':
                code = NUC_C;
                break;
            case 'g':
            case 'G':
                code = NUC_G;
                break;
            case 't':
            case 'T':
                code = NUC_T;
                break;
            default:
                obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])");
                free(seq_b);    // don't leak the partially filled buffer
                return NULL;
        }

        seq_b[i/4] |= code << shift;
    }

    return seq_b;
}
/*
 * Unpacks a DNA sequence stored with 2 bits per nucleotide.
 *
 * seq_b      : the encoded sequence (4 nucleotides per byte, highest bits first).
 * length_seq : the number of nucleotides to decode.
 *
 * Returns a newly allocated, '\0'-terminated, lower-case string that the
 * caller must free(), or NULL if the length is negative or the allocation fails.
 */
char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
{
    char*    seq;
    int32_t  i;
    uint8_t  shift;
    uint8_t  mask;
    uint8_t  nuc;

    if (length_seq < 0)
    {
        obidebug(1, "\nInvalid (negative) sequence length when decoding");
        return NULL;
    }

    seq = (char*) malloc(((size_t) length_seq + 1) * sizeof(char));
    if (seq == NULL)
    {
        obidebug(1, "\nError allocating memory for a decoded sequence");
        return NULL;
    }

    for (i=0; i<length_seq; i++)
    {
        // Isolate the 2 bits of nucleotide i inside its byte
        shift = 6 - 2*(i % 4);
        mask = NUC_MASK << shift;
        nuc = (seq_b[i/4] & mask) >> shift;

        switch (nuc)
        {
            case NUC_A:
                seq[i] = 'a';
                break;
            case NUC_C:
                seq[i] = 'c';
                break;
            case NUC_G:
                seq[i] = 'g';
                break;
            case NUC_T:
                seq[i] = 't';
                break;
            default:
                // Not expected with a 2-bit mask; kept as a safety net
                obidebug(1, "\nInvalid nucleotide base when decoding");
                free(seq);      // don't leak the partially filled buffer
                return NULL;
        }
    }

    seq[length_seq] = '\0';

    return seq;
}
////////// FOR DEBUGGING ///////////
// little endian
/*
 * Debugging helper: writes the `size` bytes starting at `ptr` to stderr as
 * groups of 8 binary digits, highest bit of each byte first, bytes separated
 * by a space.
 */
void print_bits(void* ptr, int32_t size)
{
    uint8_t* bytes = (uint8_t*) ptr;
    uint8_t  bit;
    int32_t  byte_idx;
    int32_t  bit_idx;

    fprintf(stderr, "\n");

    for (byte_idx = 0; byte_idx < size; byte_idx++)
    {
        // Go from bit 7 down to bit 0 of the current byte
        bit_idx = 8;
        while (bit_idx-- > 0)
        {
            bit = (bytes[byte_idx] >> bit_idx) & 1;
            fprintf(stderr, "%u", bit);
        }
        fprintf(stderr, " ");
    }

    fprintf(stderr, "\n");
}

95
src/encode.h Normal file
View File

@ -0,0 +1,95 @@
/****************************************************************************
* Encoding header file *
****************************************************************************/
/**
* @file encode.h
* @author Celine Mercier
* @date November 18th 2015
* @brief Header file for encoding DNA sequences.
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "obiarray.h"
#define NUC_MASK 0x3 /**< Binary: 11 to use when decoding */
/**
* @brief enum for the 2-bits codes for each of the 4 nucleotides.
*
* These are the values written by encode_seq_on_2_bits() and matched by
* decode_seq_on_2_bits(); every code fits in the 2 bits selected by NUC_MASK.
*/
enum
{
NUC_A = 0x0, /* binary: 00 */
NUC_C = 0x1, /* binary: 01 */
NUC_G = 0x2, /* binary: 10 */
NUC_T = 0x3, /* binary: 11 */
};
/**
* @brief Checks if there are only 'atgcATGC' characters in a
* character string.
*
* @param seq The sequence to check.
*
* @returns A boolean value indicating if there are only
* 'atgcATGC' characters in a character string.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
bool only_ATGC(char* seq);
/**
* @brief Encodes a DNA sequence with each nucleotide coded on 2 bits.
*
* A or a : 00
* C or c : 01
* G or g : 10
* T or t : 11
*
* @warning The DNA sequence must contain only 'atgcATGC' characters.
*
* @param seq The sequence to encode.
* @param length The length of the sequence to encode.
*
* @returns The encoded sequence.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
byte_t* encode_seq_on_2_bits(char* seq, int32_t length);
/**
* @brief Decodes a DNA sequence that is coded with each nucleotide on 2 bits.
*
* A or a : 00
* C or c : 01
* G or g : 10
* T or t : 11
*
* @param seq_b The encoded sequence to decode.
* @param length_seq The initial length of the sequence before it was encoded.
*
* @returns The decoded sequence (lower case), allocated with malloc() and ended with '\0'.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq);
////////// FOR DEBUGGING ///////////
// little endian
void print_bits(void* ptr, int32_t length);

View File

@ -24,6 +24,7 @@
#include "obitypes.h"
#include "obidebug.h"
#include "private_at_functions.h"
#include "encode.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
@ -446,6 +447,8 @@ int array_compare(byte_t* value_1, byte_t* value_2)
uint8_t size_2;
int32_t len_1;
int32_t len_2;
int32_t ini_len_1;
int32_t ini_len_2;
int32_t b;
//obidebug(1, "\nCOMPARING 1=%d,%.*s; 2=%d,%.*s", *((int32_t*)(value_1+1)), *((int32_t*)(value_1+1)), value_1+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_2+1)), *((int32_t*)(value_2+1)), value_2+BYTE_ARRAY_HEADER_SIZE);
@ -462,6 +465,15 @@ int array_compare(byte_t* value_1, byte_t* value_2)
if (len_1 != len_2)
return (len_1 - len_2);
if (size_1 != 8)
{
ini_len_1 = *((int32_t*)(value_1+5));
ini_len_2 = *((int32_t*)(value_2+5));
if (ini_len_1 != ini_len_2)
return (ini_len_1 - ini_len_2);
}
b = BYTE_ARRAY_HEADER_SIZE;
comp = 0;
while (!comp && (b < len_1+BYTE_ARRAY_HEADER_SIZE))
@ -475,7 +487,7 @@ int array_compare(byte_t* value_1, byte_t* value_2)
size_t array_sizeof(byte_t* value)
{
return (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1)) + 1);
return (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1)));
}
@ -995,6 +1007,8 @@ index_t obi_array_add(OBIDMS_array_p array, byte_t* value)
(array->first)[idx] = data_size_used;
// Store the value itself at the end of the data
// fprintf(stderr, "\nMEMCOPYING TO STORE, with size %ld :", value_size);
// printBits(value_size, value);
memcpy((((array->data)->data)+data_size_used), value, value_size);
// Update the data size
@ -1079,8 +1093,8 @@ byte_t* obi_str_to_obibytes(char* value)
uint8_t size;
size = 8;
length = strlen(value);
value_b = (byte_t*) malloc(length + BYTE_ARRAY_HEADER_SIZE + 1);
length = strlen(value) + 1; // +1 to store \0 at the end (makes retrieving faster)
value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length);
if (value_b == NULL)
{
obi_set_errno(OBI_ARRAY_ERROR);
@ -1090,7 +1104,8 @@ byte_t* obi_str_to_obibytes(char* value)
*(value_b) = size;
*((int32_t*)(value_b+1)) = length;
*((int32_t*)(value_b+1)) = length; // TODO comment
*((int32_t*)(value_b+5)) = length;
strcpy(value_b+BYTE_ARRAY_HEADER_SIZE, value);
@ -1107,3 +1122,73 @@ const char* obi_obibytes_to_str(byte_t* value_b)
return value;
}
byte_t* obi_seq_to_obibytes(char* seq)
{
byte_t* value_b;
int32_t length; // length of the value (without the header) in bytes
uint8_t size; // size of one element in bits
int32_t seq_length;
byte_t* encoded_seq;
// Check if just ATGC and set size of a nucleotide accordingly (2 bits or 4 bits)
//fprintf(stderr, "\nonly ATGC = %d", only_ATGC(seq));
if (only_ATGC(seq))
size = 2;
else
size = 4;
// Set length
seq_length = strlen(seq);
if (size == 2)
length = ceil((double) seq_length / (double) 4.0);
else // size == 4
length = ceil((double) seq_length / (double) 2.0);
// Encode
if (size == 2)
encoded_seq = encode_seq_on_2_bits(seq, seq_length);
else // size == 4
return NULL;
// encoded_seq = encode_seq_on_4_bits(seq, seq_length);
// Set the values in the byte array
value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length);
*(value_b) = size;
*((int32_t*)(value_b+1)) = length;
*((int32_t*)(value_b+5)) = seq_length;
//fprintf(stderr, "\nstored seq length : %d\n", *((int32_t*)(value_b+5)));
memcpy(value_b+BYTE_ARRAY_HEADER_SIZE, encoded_seq, length);
//obidebug(1, "\n\nENCODED VALUE_B = ");
//printBits(((*((int32_t*)(value_b+1)))+BYTE_ARRAY_HEADER_SIZE), value_b);
free(encoded_seq);
return value_b;
}
/*
 * Converts a byte array with a header back to a DNA sequence.
 *
 * The first header byte gives the number of bits per nucleotide; only the
 * 2-bit encoding is handled, anything else yields NULL.
 * The returned string comes from decode_seq_on_2_bits(), which allocates it.
 */
const char* obi_obibytes_to_seq(byte_t* value_b)
{
    uint8_t  size;          // size of one element in bits
    int32_t  seq_length;    // length of the sequence in nucleotides

    size = *(value_b);

    // Only the 2-bit encoding can be decoded for now
    if (size != 2)
        return NULL;

    seq_length = *((int32_t*)(value_b+5));

    return decode_seq_on_2_bits(value_b+BYTE_ARRAY_HEADER_SIZE, seq_length);
}

View File

@ -29,7 +29,7 @@
*/
#define ARRAY_GROWTH_FACTOR (2) /**< The growth factor when an array is enlarged.
*/
#define BYTE_ARRAY_HEADER_SIZE (5) /**< The size of the header of a byte array.
#define BYTE_ARRAY_HEADER_SIZE (9) /**< The size of the header of a byte array.
*/
@ -284,5 +284,34 @@ byte_t* obi_str_to_obibytes(char* value);
const char* obi_obibytes_to_str(byte_t* value_b);
/**
* @brief Converts a DNA sequence to a byte array with a header.
*
* @warning The byte array must be freed by the caller.
*
* @param seq The DNA sequence to convert.
*
* @returns A pointer to the byte array created.
* @retval NULL if an error occurred.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
byte_t* obi_seq_to_obibytes(char* seq);
/**
* @brief Converts a byte array to a DNA sequence.
*
* @param value_b The byte array to convert.
*
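* @returns A pointer to the DNA sequence contained in the byte array.
*          The string is allocated by the decoding function
*          (presumably to be freed by the caller -- TODO confirm).
* @retval NULL if the byte array is not encoded on 2 bits per nucleotide.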
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
const char* obi_obibytes_to_seq(byte_t* value_b);
#endif /* OBIARRAY_H_ */

View File

@ -533,12 +533,12 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
obidebug(1, "\nCan't create column because of empty column name");
return NULL;
}
if ((data_type < 1) || (data_type > 5))
if ((data_type < 1) || (data_type > 6))
{
obidebug(1, "\nCan't create column because of invalid data type");
return NULL;
}
if ((data_type == 5) && (array_name == NULL))
if (((data_type == 5) || (data_type == 6)) && (array_name == NULL))
{
obidebug(1, "\nCan't create column because of empty array name");
return NULL;
@ -701,8 +701,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
if (comments != NULL)
strncpy(header->comments, comments, COMMENTS_MAX_LENGTH);
// If the data type is OBI_IDX, the associated obi_array is opened or created
if (data_type == 5)
// If the data type is OBI_STR or OBI_SEQ, the associated obi_array is opened or created
if ((data_type == 5) || (data_type == 6))
{
array = obi_array(dms, array_name);
if (array == NULL)
@ -838,8 +838,8 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms,
column->writable = false;
// If the data type is OBI_IDX, the associated obi_array is opened or created
if ((column->header)->data_type == 5)
// If the data type is OBI_STR or OBI_SEQ, the associated obi_array is opened or created
if (((column->header)->data_type == 5) || ((column->header)->data_type == 6))
{
array = obi_array(dms, (column->header)->array_name);
if (array == NULL)
@ -1175,7 +1175,8 @@ void obi_ini_to_NA_values(OBIDMS_column_p column,
}
break;
case OBI_IDX: for (i=start;i<end;i++)
case OBI_STR:
case OBI_SEQ: for (i=start;i<end;i++)
{
*(((index_t*) (column->data)) + i) = OBIIdx_NA;
}

120
src/obidmscolumn_seq.c Normal file
View File

@ -0,0 +1,120 @@
/****************************************************************************
* OBIDMS_column_seq functions *
****************************************************************************/
/**
* @file obidmscolumn_seq.c
* @author Celine Mercier
* @date November 18th 2015
* @brief Functions handling OBIColumns containing data in the form of indices referring to DNA sequences.
*/
#include <stdlib.h>
#include <stdio.h>
#include "obidmscolumn.h"
#include "obitypes.h"
#include "obierrno.h"
#include "obidebug.h"
#include "obiarray.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
/**********************************************************************
*
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
*
**********************************************************************/
/*
 * Sets a DNA sequence in a column, at a given line and element index.
 *
 * The sequence is encoded into a byte array, added to the obiarray of the
 * column, and the index returned by the obiarray is stored in the column.
 *
 * Returns 0 on success, -1 if the line number is too big, if the column
 * can't be enlarged, or if the sequence can't be encoded or stored.
 */
int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value)
{
    byte_t* value_b;
    index_t idx;

    // Check that the line number is not greater than the maximum allowed
    if (line_nb >= MAXIMUM_LINE_COUNT)
    {
        obi_set_errno(OBICOL_UNKNOWN_ERROR);
        obidebug(1, "\nError trying to set a value at a line number greater than the maximum allowed");
        return -1;
    }

    // Check if the file needs to be enlarged
    while ((line_nb+1) > (column->header)->line_count)
    {
        // Enlarge the file
        if (obi_enlarge_column(column) < 0)
            return -1;
    }

    // Update lines used
    if ((line_nb+1) > (column->header)->lines_used)
        (column->header)->lines_used = line_nb+1;

    // Encode the value on a byte array with a header
    value_b = obi_seq_to_obibytes(value);
    if (value_b == NULL)
        return -1;

    // Add in the obiarray
    idx = obi_array_add(column->array, value_b);

    // The temporary byte array is released right after the call, whether
    // adding succeeded or failed, so it is not leaked on the error path
    free(value_b);

    if (idx == -1)
        return -1;

    // Add the value's index in the column
    *(((index_t*) (column->data)) + (line_nb * ((column->header)->nb_elements_per_line)) + element_idx) = idx;

    return 0;
}
/*
 * Recovers a DNA sequence from a column, at a given line and element index.
 *
 * Returns an empty string (the NA value) if the line is beyond the lines
 * used (obi_errno is then set) or if the stored index is OBIIdx_NA;
 * otherwise returns the sequence decoded from the obiarray of the column.
 *
 * NOTE(review): the NA value is a static literal while decoded sequences
 * appear to be heap-allocated by the decoder -- confirm who owns the result.
 */
const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx)
{
    index_t* cells;
    index_t  stored_idx;

    if ((line_nb+1) > (column->header)->lines_used)
    {
        obi_set_errno(OBICOL_UNKNOWN_ERROR);
        obidebug(1, "\nError trying to get a value that is beyond the current number of lines used");
        return "\0"; // TODO
    }

    // The column data is a flat table of indices, nb_elements_per_line per line
    cells = (index_t*) (column->data);
    stored_idx = *(cells + (line_nb * ((column->header)->nb_elements_per_line)) + element_idx);

    // Check NA
    if (stored_idx == OBIIdx_NA)
        return "\0"; // TODO

    // Fetch the byte array from the obiarray and decode it
    return obi_obibytes_to_seq(obi_array_get(column->array, stored_idx));
}
/*
 * Sets a DNA sequence in a column, at a given line and element name.
 *
 * Resolves the element name to its index, then delegates to
 * obi_column_set_obiseq_with_elt_idx().
 * Returns 0 on success, -1 if the name is unknown or if setting fails.
 */
int obi_column_set_obiseq_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name, char* value)
{
    index_t elt_pos = obi_column_get_element_index_from_name(column, element_name);

    if (elt_pos == OBIIdx_NA)
        return -1;

    return (obi_column_set_obiseq_with_elt_idx(column, line_nb, elt_pos, value) < 0) ? -1 : 0;
}
/*
 * Recovers a DNA sequence from a column, at a given line and element name.
 *
 * Resolves the element name to its index, then delegates to
 * obi_column_get_obiseq_with_elt_idx().
 * Returns an empty string (the NA value) if the name is unknown.
 */
const char* obi_column_get_obiseq_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name)
{
    index_t elt_pos = obi_column_get_element_index_from_name(column, element_name);

    if (elt_pos != OBIIdx_NA)
        return obi_column_get_obiseq_with_elt_idx(column, line_nb, elt_pos);

    return "\0";
}

101
src/obidmscolumn_seq.h Normal file
View File

@ -0,0 +1,101 @@
/****************************************************************************
* OBIDMS_column_seq header file *
****************************************************************************/
/**
* @file obidmscolumn_seq.h
* @author Celine Mercier
* @date November 18th 2015
* @brief Header file for the functions handling OBIColumns containing data in the form of indices referring to DNA sequences.
*/
#ifndef OBIDMSCOLUMN_SEQ_H_
#define OBIDMSCOLUMN_SEQ_H_
#include <stdlib.h>
#include <stdio.h>
#include "obidmscolumn.h"
#include "obitypes.h"
/**
* @brief Sets a value in an OBIDMS column containing data in the form of indices referring
* to DNA sequences in an obiarray, using the index of the element in the line.
*
* @warning Pointers returned by obi_open_column() don't allow writing.
*
* @param column A pointer as returned by obi_create_column() or obi_clone_column().
* @param line_nb The number of the line where the value should be set.
* @param element_idx The index of the element that should be set in the line.
* @param value The value that should be set.
*
* @returns An integer value indicating the success of the operation.
* @retval 0 on success.
* @retval -1 if an error occurred.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value);
/**
* @brief Recovers a value in an OBIDMS column containing data in the form of indices referring
* to DNA sequences in an obiarray, using the index of the element in the line.
*
* @param column A pointer as returned by obi_create_column().
* @param line_nb The number of the line where the value should be recovered.
* @param element_idx The index of the element that should be recovered in the line.
*
* @returns The recovered value.
* @retval '\0' the NA value of the type if an error occurred and obi_errno is set.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx);
/**
* @brief Sets a value in an OBIDMS column containing data in the form of indices referring
* to DNA sequences in an obiarray, using the name of the element in the line.
*
* @warning Pointers returned by obi_open_column() don't allow writing.
*
* @param column A pointer as returned by obi_create_column() or obi_clone_column().
* @param line_nb The number of the line where the value should be set.
* @param element_name The name of the element that should be set in the line.
* @param value The value that should be set.
*
* @returns An integer value indicating the success of the operation.
* @retval 0 on success.
* @retval -1 if an error occurred.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_column_set_obiseq_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name, char* value);
/**
* @brief Recovers a value in an OBIDMS column containing data in the form of indices referring
* to DNA sequences in an obiarray, using the name of the element in the line.
*
* @param column A pointer as returned by obi_create_column() or obi_clone_column().
* @param line_nb The number of the line where the value should be recovered.
* @param element_name The name of the element that should be recovered in the line.
*
* @returns The recovered value.
* @retval '\0' the NA value of the type if an error occurred and obi_errno is set.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
const char* obi_column_get_obiseq_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name);
#endif /* OBIDMSCOLUMN_SEQ_H_ */

View File

@ -97,5 +97,5 @@ int obi_column_set_obistr_with_elt_name(OBIDMS_column_p column, index_t line_nb,
const char* obi_column_get_obistr_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name);
#endif /* OBIDMSCOLUMN_IDX_H_ */
#endif /* OBIDMSCOLUMN_STR_H_ */

View File

@ -40,7 +40,10 @@ size_t obi_sizeof(OBIType_t type)
case OBI_CHAR: size = sizeof(obichar_t);
break;
case OBI_IDX: size = sizeof(index_t);
case OBI_STR: size = sizeof(index_t);
break;
case OBI_SEQ: size = sizeof(index_t);
break;
default: size = 0;
@ -90,7 +93,10 @@ char* name_data_type(int data_type)
case OBI_CHAR: name = strdup("OBI_CHAR");
break;
case OBI_IDX: name = strdup("OBI_IDX");
case OBI_STR: name = strdup("OBI_STR");
break;
case OBI_SEQ: name = strdup("OBI_SEQ");
break;
}

View File

@ -44,7 +44,8 @@ typedef enum OBIType {
OBI_FLOAT, /**< a floating value (C type : double) */
OBI_BOOL, /**< a boolean true/false value, see obibool_t enum */
OBI_CHAR, /**< a character (C type : char) */
OBI_IDX /**< an index in a data structure (C type : int64_t) */
OBI_STR, /**< an index in a data structure (C type : int64_t) referring to a character string*/
OBI_SEQ /**< an index in a data structure (C type : int64_t) referring to a DNA sequence*/
} OBIType_t, *OBIType_p;
@ -52,7 +53,7 @@ typedef int64_t index_t;
typedef int32_t obiint_t;
typedef double obifloat_t;
typedef char obichar_t;
// TODO same for obistr_t and obiseq_t ?
/**
* @brief Union used to compute the NA value of the OBI_FLOAT OBIType.