diff --git a/python/obitools3/obidms/_obidms.cfiles b/python/obitools3/obidms/_obidms.cfiles index aa46c53..a141e81 100644 --- a/python/obitools3/obidms/_obidms.cfiles +++ b/python/obitools3/obidms/_obidms.cfiles @@ -14,3 +14,5 @@ ../../../src/private_at_functions.c ../../../src/obiarray.h ../../../src/obiarray.c +../../../src/encode.h +../../../src/encode.c \ No newline at end of file diff --git a/python/obitools3/obidms/_obidms.pyx b/python/obitools3/obidms/_obidms.pyx index db16e98..ae35f80 100644 --- a/python/obitools3/obidms/_obidms.pyx +++ b/python/obitools3/obidms/_obidms.pyx @@ -47,6 +47,11 @@ from ._obidmscolumn_str cimport OBIDMS_column_str, \ OBIDMS_column_str_multi_elts, \ OBIDMS_column_str_multi_elts_writable +from ._obidmscolumn_seq cimport OBIDMS_column_seq, \ + OBIDMS_column_seq_writable, \ + OBIDMS_column_seq_multi_elts, \ + OBIDMS_column_seq_multi_elts_writable + cdef class OBIDMS : @@ -215,6 +220,17 @@ cdef class OBIDMS : subclass = OBIDMS_column_str else : subclass = OBIDMS_column_str_multi_elts + elif data_type == 6 : + if (create or clone) : + if nb_elements_per_line == 1 : + subclass = OBIDMS_column_seq_writable + else : + subclass = OBIDMS_column_seq_multi_elts_writable + else : + if nb_elements_per_line == 1 : + subclass = OBIDMS_column_seq + else : + subclass = OBIDMS_column_seq_multi_elts else : raise Exception("Problem with the data type") @@ -238,7 +254,7 @@ cdef class OBIDMS_column : bint create, bint clone, bint clone_data, obiversion_t version_number, - OBIType_t type, + OBIType_t type, # There's a problem with this with the OBI_IDX columns as there are 2 subtypes index_t nb_lines, index_t nb_elements_per_line, list elements_names, diff --git a/python/obitools3/obidms/_obidmscolumn_seq.cfiles b/python/obitools3/obidms/_obidmscolumn_seq.cfiles new file mode 100644 index 0000000..b8a9119 --- /dev/null +++ b/python/obitools3/obidms/_obidmscolumn_seq.cfiles @@ -0,0 +1,18 @@ +../../../src/obidmscolumn_seq.c 
+../../../src/obidmscolumn_seq.h +../../../src/obidmscolumn.h +../../../src/obidmscolumn.c +../../../src/obidmscolumndir.h +../../../src/obidmscolumndir.c +../../../src/obidms.h +../../../src/obidms.c +../../../src/obierrno.h +../../../src/obierrno.c +../../../src/obilittlebigman.h +../../../src/obilittlebigman.c +../../../src/obitypes.h +../../../src/obitypes.c +../../../src/private_at_functions.h +../../../src/private_at_functions.c +../../../src/obiarray.h +../../../src/obiarray.c diff --git a/python/obitools3/obidms/_obidmscolumn_seq.pxd b/python/obitools3/obidms/_obidmscolumn_seq.pxd new file mode 100644 index 0000000..e06654e --- /dev/null +++ b/python/obitools3/obidms/_obidmscolumn_seq.pxd @@ -0,0 +1,25 @@ +#cython: language_level=3 + +from .capi.obitypes cimport index_t +from ._obidms cimport OBIDMS_column + + +cdef class OBIDMS_column_seq(OBIDMS_column): + cpdef object get_line(self, index_t line_nb) + cpdef set_line(self, index_t line_nb, object value) + cpdef close(self) + +cdef class OBIDMS_column_seq_writable(OBIDMS_column_seq): + cpdef set_line(self, index_t line_nb, object value) + cpdef close(self) + +cdef class OBIDMS_column_seq_multi_elts(OBIDMS_column_seq): + cpdef object get_item(self, index_t line_nb, str element_name) + cpdef object get_line(self, index_t line_nb) + cpdef set_item(self, index_t line_nb, str element_name, str value) + cpdef set_line(self, index_t line_nb, object values) + +cdef class OBIDMS_column_seq_multi_elts_writable(OBIDMS_column_seq_multi_elts): + cpdef set_item(self, index_t line_nb, str element_name, str value) + cpdef set_line(self, index_t line_nb, object values) + cpdef close(self) diff --git a/python/obitools3/obidms/_obidmscolumn_seq.pyx b/python/obitools3/obidms/_obidmscolumn_seq.pyx new file mode 100644 index 0000000..4e71f35 --- /dev/null +++ b/python/obitools3/obidms/_obidmscolumn_seq.pyx @@ -0,0 +1,103 @@ +#cython: language_level=3 + +from .capi.obidmscolumn cimport obi_close_column,\ + 
obi_truncate_and_close_column, \ + obi_column_get_obiseq_with_elt_name, \ + obi_column_get_obiseq_with_elt_idx, \ + obi_column_set_obiseq_with_elt_name, \ + obi_column_set_obiseq_with_elt_idx +from .capi.obierrno cimport obi_errno +from .capi.obitypes cimport OBIIdx_NA, const_char_p + +from obitools3.utils cimport str2bytes, bytes2str + + +cdef class OBIDMS_column_seq(OBIDMS_column): + + cpdef object get_line(self, index_t line_nb): + cdef bytes value + cdef object result + value = obi_column_get_obiseq_with_elt_idx(self.pointer, line_nb, 0) + if obi_errno > 0 : + raise IndexError(line_nb) + if value == OBIIdx_NA : + result = None + else : + result = bytes2str(value) + return result + + cpdef set_line(self, index_t line_nb, object value): + raise Exception("Column is read-only") + + cpdef close(self): + if obi_close_column(self.pointer) < 0 : + raise Exception("Problem closing a column") + + +cdef class OBIDMS_column_seq_writable(OBIDMS_column_seq): + + cpdef set_line(self, index_t line_nb, object value): + if obi_column_set_obiseq_with_elt_idx(self.pointer, line_nb, 0, str2bytes(value)) < 0: + raise Exception("Problem setting a value in a column") + + cpdef close(self): + if obi_truncate_and_close_column(self.pointer) < 0 : + raise Exception("Problem closing a column") + + +cdef class OBIDMS_column_seq_multi_elts(OBIDMS_column_seq): + + cpdef object get_item(self, index_t line_nb, str element_name): + cdef bytes value + cdef object result + value = obi_column_get_obiseq_with_elt_name(self.pointer, line_nb, str2bytes(element_name)) + if obi_errno > 0 : + raise IndexError(line_nb, element_name) + if value == OBIIdx_NA : + result = None + else : + result = bytes2str(value) + return result + + cpdef object get_line(self, index_t line_nb) : + cdef bytes value + cdef object result + cdef index_t i + cdef bint all_NA + result = {} + all_NA = True + for i in range(self.nb_elements_per_line) : + value = obi_column_get_obiseq_with_elt_idx(self.pointer, line_nb, i) + if 
obi_errno > 0 : + raise IndexError(line_nb) + result[self.elements_names[i]] = bytes2str(value) + if all_NA and (value != OBIIdx_NA) : + all_NA = False + if all_NA : + result = None + return result + + cpdef set_item(self, index_t line_nb, str element_name, str value): + raise Exception("Column is read-only") + + cpdef set_line(self, index_t line_nb, object values): + raise Exception("Column is read-only") + + +cdef class OBIDMS_column_seq_multi_elts_writable(OBIDMS_column_seq_multi_elts): + + cpdef set_item(self, index_t line_nb, str element_name, str value): + if obi_column_set_obiseq_with_elt_name(self.pointer, line_nb, str2bytes(element_name), str2bytes(value)) < 0: + raise Exception("Problem setting a value in a column") + + cpdef set_line(self, index_t line_nb, object values): + cdef str value + for element_name in values : + value = values[element_name] + self.set_item(line_nb, element_name, value) + + cpdef close(self): + if obi_truncate_and_close_column(self.pointer) < 0 : + raise Exception("Problem closing a column") + + \ No newline at end of file diff --git a/python/obitools3/obidms/capi/obidmscolumn.pxd b/python/obitools3/obidms/capi/obidmscolumn.pxd index 196f310..ab16a6b 100644 --- a/python/obitools3/obidms/capi/obidmscolumn.pxd +++ b/python/obitools3/obidms/capi/obidmscolumn.pxd @@ -163,10 +163,30 @@ cdef extern from "obidmscolumn_str.h" nogil: char* value) const_char_p obi_column_get_obistr_with_elt_name(OBIDMS_column_p column, - index_t line_nb, - const_char_p element_name) + index_t line_nb, + const_char_p element_name) const_char_p obi_column_get_obistr_with_elt_idx(OBIDMS_column_p column, - index_t line_nb, - index_t element_idx) + index_t line_nb, + index_t element_idx) + +cdef extern from "obidmscolumn_seq.h" nogil: + + int obi_column_set_obiseq_with_elt_name(OBIDMS_column_p column, + index_t line_nb, + const_char_p element_name, + char* value) + + int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, + index_t line_nb, + index_t 
element_idx, + char* value) + + const_char_p obi_column_get_obiseq_with_elt_name(OBIDMS_column_p column, + index_t line_nb, + const_char_p element_name) + + const_char_p obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, + index_t line_nb, + index_t element_idx) diff --git a/python/obitools3/unit_tests.py b/python/obitools3/unit_tests.py index 1be696e..93112d2 100644 --- a/python/obitools3/unit_tests.py +++ b/python/obitools3/unit_tests.py @@ -10,9 +10,9 @@ from obitools3.obidms._obidms import OBIDMS LINE_COUNT_FOR_TEST_COLUMN = 10000 # TODO randomize? SMALLER_LINE_COUNT_FOR_TEST_COLUMN = 1000 # TODO randomize? -NB_ELEMENTS_PER_LINE = 20 # TODO randomize? +NB_ELEMENTS_PER_LINE = 10 # TODO randomize? DMS_NAME = "unit_test_dms" -DATA_TYPES = ['OBI_INT', 'OBI_FLOAT', 'OBI_BOOL', 'OBI_CHAR', 'OBI_IDX'] +DATA_TYPES = ['OBI_INT', 'OBI_FLOAT', 'OBI_BOOL', 'OBI_CHAR', 'OBI_STR', 'OBI_SEQ'] def create_test_obidms(): @@ -58,12 +58,15 @@ def random_obivalue(data_type): elif data_type == "OBI_BOOL" : return randint(0,1) elif data_type == "OBI_CHAR" : - nucs = 'atgc' - return nucs[randint(0,3)] - elif data_type == "OBI_IDX" : - length = randint(1,500) + return choice(string.ascii_lowercase) + elif data_type == "OBI_STR" : + length = randint(1,200) randoms = ''.join(choice(string.ascii_lowercase) for i in range(length)) return randoms + elif data_type == "OBI_SEQ" : + length = randint(1,200) + randoms = ''.join(choice("atgc") for i in range(length)) + return randoms class OBIDMS_Column_TestCase(unittest.TestCase): def tearDown(self): @@ -255,6 +258,30 @@ class OBIDMS_Column_OBI_STR_multiple_elements_TestCase(OBIDMS_Column_multiple_el self.data_type_code, multiple_elements_per_line=True) +class OBIDMS_Column_OBI_SEQ_TestCase(OBIDMS_Column_TestCase): + def setUp(self): + self.data_type_code = 6 + self.dms, \ + self.dms_name, \ + self.dms_dir_name = create_test_obidms() + self.col, \ + self.col_name, \ + self.data_type_str = create_test_column(self.dms, + 
self.data_type_code) + + +class OBIDMS_Column_OBI_SEQ_multiple_elements_TestCase(OBIDMS_Column_multiple_elements_TestCase): + def setUp(self): + self.data_type_code = 6 + self.dms, \ + self.dms_name, \ + self.dms_dir_name = create_test_obidms() + self.col, \ + self.col_name, \ + self.elts_names, \ + self.data_type_str = create_test_column(self.dms, + self.data_type_code, + multiple_elements_per_line=True) if __name__ == '__main__': unittest.main(verbosity=2, defaultTest=["OBIDMS_Column_OBI_INT_TestCase", @@ -266,6 +293,8 @@ if __name__ == '__main__': "OBIDMS_Column_OBI_CHAR_TestCase", "OBIDMS_Column_OBI_CHAR_multiple_elements_TestCase", "OBIDMS_Column_OBI_STR_TestCase", - "OBIDMS_Column_OBI_STR_multiple_elements_TestCase"]) + "OBIDMS_Column_OBI_STR_multiple_elements_TestCase", + "OBIDMS_Column_OBI_SEQ_TestCase", + "OBIDMS_Column_OBI_SEQ_multiple_elements_TestCase"]) diff --git a/src/encode.c b/src/encode.c new file mode 100644 index 0000000..29f06a1 --- /dev/null +++ b/src/encode.c @@ -0,0 +1,180 @@ +/**************************************************************************** + * Encoding functions * + ****************************************************************************/ + +/** + * @file encode.c + * @author Celine Mercier + * @date November 18th 2015 + * @brief Functions encoding DNA sequences. + */ + + +#include +#include +#include +#include + +#include "encode.h" +#include "obiarray.h" +#include "obidebug.h" + + +#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + + +// TODO: endianness problem? 
+ + + +bool only_ATGC(char* seq) +{ + char* c = seq; + + while (*c) + { + if (!((*c == 'A') || \ + (*c == 'T') || \ + (*c == 'G') || \ + (*c == 'C') || \ + (*c == 'a') || \ + (*c == 't') || \ + (*c == 'g') || \ + (*c == 'c'))) + { + return 0; + } + else + { + c++; + } + } + return 1; +} + + +byte_t* encode_seq_on_2_bits(char* seq, int32_t length) // TODO shift = 2 +{ + byte_t* seq_b; + uint8_t shift; + int32_t length_b; + int32_t i; + +// fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>Encoding sequence %s", seq); + + length_b = ceil((double) length / (double) 4.0); + +// fprintf(stderr, "\nLength: %d", length_b); + + seq_b = (byte_t*) malloc(length_b * sizeof(byte_t)); + + memset(seq_b, 0, length_b); + + for (i=0; i>>>>>>>>Encoded:"); +// print_bits(seq_b, length_b); + + return seq_b; +} + + +char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq) +{ + char* seq; + int32_t i; + uint8_t shift; + uint8_t mask; + uint8_t nuc; + + seq = (char*) malloc((length_seq+1) * sizeof(char)); + + for (i=0; i> shift; + + switch (nuc) + { + case NUC_A: + seq[i] = 'a'; + break; + case NUC_C: + seq[i] = 'c'; + break; + case NUC_G: + seq[i] = 'g'; + break; + case NUC_T: + seq[i] = 't'; + break; + default: + obidebug(1, "\nInvalid nucleotide base when decoding"); + return NULL; + } + } + + seq[length_seq] = '\0'; + + return seq; +} + + +////////// FOR DEBUGGING /////////// + +// little endian +void print_bits(void* ptr, int32_t size) +{ + uint8_t* b = (uint8_t*) ptr; + uint8_t byte; + int32_t i, j; + + fprintf(stderr, "\n"); + for (i=0;i=0;j--) + { + byte = b[i] & (1<>= j; + fprintf(stderr, "%u", byte); + } + fprintf(stderr, " "); + } + fprintf(stderr, "\n"); +} diff --git a/src/encode.h b/src/encode.h new file mode 100644 index 0000000..5a1bac8 --- /dev/null +++ b/src/encode.h @@ -0,0 +1,95 @@ +/**************************************************************************** + * Encoding header file * + ****************************************************************************/ + +/** + * 
@file encode.h + * @author Celine Mercier + * @date November 18th 2015 + * @brief Header file for encoding DNA sequences. + */ + + +#include +#include +#include +#include + +#include "obiarray.h" + + +#define NUC_MASK 0x3 /**< Binary: 11 to use when decoding */ + + +/** + * @brief enum for the 2-bits codes for each of the 4 nucleotides. + */ +enum +{ + NUC_A = 0x0, /* binary: 00 */ + NUC_C = 0x1, /* binary: 01 */ + NUC_G = 0x2, /* binary: 10 */ + NUC_T = 0x3, /* binary: 11 */ +}; + + +/** + * @brief Checks if there are only 'atgcATGC' characters in a + * character string. + * + * @param seq The sequence to check. + * + * @returns A boolean value indicating if there are only + * 'atgcATGC' characters in a character string. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +bool only_ATGC(char* seq); + + +/** + * @brief Encodes a DNA sequence with each nucleotide coded on 2 bits. + * + * A or a : 00 + * C or c : 01 + * G or g : 10 + * T or t : 11 + * + * @warning The DNA sequence must contain only 'atgcATGC' characters. + * + * @param seq The sequence to encode. + * @param length The length of the sequence to encode. + * + * @returns The encoded sequence. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +byte_t* encode_seq_on_2_bits(char* seq, int32_t length); + + +/** + * @brief Decodes a DNA sequence that is coded with each nucleotide on 2 bits. + * + * A or a : 00 + * C or c : 01 + * G or g : 10 + * T or t : 11 + * + * @param seq_b The encoded sequence to decode. + * @param length_seq The initial length of the sequence before it was encoded. + * + * @returns The decoded sequence ended with '\0'. 
+ * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq); + + +////////// FOR DEBUGGING /////////// + +// little endian +void print_bits(void* ptr, int32_t length); + diff --git a/src/obiarray.c b/src/obiarray.c index b893660..a4882c6 100644 --- a/src/obiarray.c +++ b/src/obiarray.c @@ -24,6 +24,7 @@ #include "obitypes.h" #include "obidebug.h" #include "private_at_functions.h" +#include "encode.h" #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) @@ -446,6 +447,8 @@ int array_compare(byte_t* value_1, byte_t* value_2) uint8_t size_2; int32_t len_1; int32_t len_2; + int32_t ini_len_1; + int32_t ini_len_2; int32_t b; //obidebug(1, "\nCOMPARING 1=%d,%.*s; 2=%d,%.*s", *((int32_t*)(value_1+1)), *((int32_t*)(value_1+1)), value_1+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_2+1)), *((int32_t*)(value_2+1)), value_2+BYTE_ARRAY_HEADER_SIZE); @@ -462,6 +465,15 @@ int array_compare(byte_t* value_1, byte_t* value_2) if (len_1 != len_2) return (len_1 - len_2); + if (size_1 != 8) + { + ini_len_1 = *((int32_t*)(value_1+5)); + ini_len_2 = *((int32_t*)(value_2+5)); + + if (ini_len_1 != ini_len_2) + return (ini_len_1 - ini_len_2); + } + b = BYTE_ARRAY_HEADER_SIZE; comp = 0; while (!comp && (b < len_1+BYTE_ARRAY_HEADER_SIZE)) @@ -475,7 +487,7 @@ int array_compare(byte_t* value_1, byte_t* value_2) size_t array_sizeof(byte_t* value) { - return (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1)) + 1); + return (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1))); } @@ -995,6 +1007,8 @@ index_t obi_array_add(OBIDMS_array_p array, byte_t* value) (array->first)[idx] = data_size_used; // Store the value itself at the end of the data +// fprintf(stderr, "\nMEMCOPYING TO STORE, with size %ld :", value_size); +// printBits(value_size, value); memcpy((((array->data)->data)+data_size_used), value, value_size); // Update the data size @@ -1079,8 +1093,8 @@ byte_t* 
obi_str_to_obibytes(char* value) uint8_t size; size = 8; - length = strlen(value); - value_b = (byte_t*) malloc(length + BYTE_ARRAY_HEADER_SIZE + 1); + length = strlen(value) + 1; // +1 to store \0 at the end (makes retrieving faster) + value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length); if (value_b == NULL) { obi_set_errno(OBI_ARRAY_ERROR); @@ -1090,7 +1104,8 @@ byte_t* obi_str_to_obibytes(char* value) *(value_b) = size; - *((int32_t*)(value_b+1)) = length; + *((int32_t*)(value_b+1)) = length; // TODO comment + *((int32_t*)(value_b+5)) = length; strcpy(value_b+BYTE_ARRAY_HEADER_SIZE, value); @@ -1107,3 +1122,73 @@ const char* obi_obibytes_to_str(byte_t* value_b) return value; } + +byte_t* obi_seq_to_obibytes(char* seq) +{ + byte_t* value_b; + int32_t length; // length of the value (without the header) in bytes + uint8_t size; // size of one element in bits + int32_t seq_length; + byte_t* encoded_seq; + + // Check if just ATGC and set size of a nucleotide accordingly (2 bits or 4 bits) + //fprintf(stderr, "\nonly ATGC = %d", only_ATGC(seq)); + if (only_ATGC(seq)) + size = 2; + else + size = 4; + + // Set length + seq_length = strlen(seq); + if (size == 2) + length = ceil((double) seq_length / (double) 4.0); + else // size == 4 + length = ceil((double) seq_length / (double) 2.0); + + // Encode + if (size == 2) + encoded_seq = encode_seq_on_2_bits(seq, seq_length); + else // size == 4 + return NULL; + // encoded_seq = encode_seq_on_4_bits(seq, seq_length); + + // Set the values in the byte array + value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length); + + *(value_b) = size; + *((int32_t*)(value_b+1)) = length; + *((int32_t*)(value_b+5)) = seq_length; + + //fprintf(stderr, "\nstored seq length : %d\n", *((int32_t*)(value_b+5))); + + memcpy(value_b+BYTE_ARRAY_HEADER_SIZE, encoded_seq, length); + + //obidebug(1, "\n\nENCODED VALUE_B = "); + //printBits(((*((int32_t*)(value_b+1)))+BYTE_ARRAY_HEADER_SIZE), value_b); + + free(encoded_seq); + + return 
value_b; +} + + +const char* obi_obibytes_to_seq(byte_t* value_b) +{ + const char* value; + uint8_t size; // size of one element in bits + + //obidebug(1, "\n\nGONNA DECODE VALUE_B = "); + //printBits(((*((int32_t*)(value_b+1)))+BYTE_ARRAY_HEADER_SIZE), value_b); + + size = *(value_b); + + // Decode + if (size == 2) + value = decode_seq_on_2_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5))); + else + return NULL; +// value = decode_seq_on_4_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5))); + + return value; +} + diff --git a/src/obiarray.h b/src/obiarray.h index 80f7276..cfcaf12 100644 --- a/src/obiarray.h +++ b/src/obiarray.h @@ -29,7 +29,7 @@ */ #define ARRAY_GROWTH_FACTOR (2) /**< The growth factor when an array is enlarged. */ -#define BYTE_ARRAY_HEADER_SIZE (5) /**< The size of the header of a byte array. +#define BYTE_ARRAY_HEADER_SIZE (9) /**< The size of the header of a byte array. */ @@ -284,5 +284,34 @@ byte_t* obi_str_to_obibytes(char* value); const char* obi_obibytes_to_str(byte_t* value_b); +/** + * @brief Converts a DNA sequence to a byte array with a header. + * + * @warning The byte array must be freed by the caller. + * + * @param value The DNA sequence to convert. + * + * @returns A pointer to the byte array created. + * @retval NULL if an error occurred. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +byte_t* obi_seq_to_obibytes(char* seq); + + +/** + * @brief Converts a byte array to a DNA sequence. + * + * @param value_b The byte array to convert. + * + * @returns A pointer to the DNA sequence contained in the byte array. 
+ * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +const char* obi_obibytes_to_seq(byte_t* value_b); + + #endif /* OBIARRAY_H_ */ diff --git a/src/obidmscolumn.c b/src/obidmscolumn.c index 5101f68..8c7e930 100644 --- a/src/obidmscolumn.c +++ b/src/obidmscolumn.c @@ -533,12 +533,12 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, obidebug(1, "\nCan't create column because of empty column name"); return NULL; } - if ((data_type < 1) || (data_type > 5)) + if ((data_type < 1) || (data_type > 6)) { obidebug(1, "\nCan't create column because of invalid data type"); return NULL; } - if ((data_type == 5) && (array_name == NULL)) + if (((data_type == 5) || (data_type == 6)) && (array_name == NULL)) { obidebug(1, "\nCan't create column because of empty array name"); return NULL; @@ -701,8 +701,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, if (comments != NULL) strncpy(header->comments, comments, COMMENTS_MAX_LENGTH); - // If the data type is OBI_IDX, the associated obi_array is opened or created - if (data_type == 5) + // If the data type is OBI_STR or OBI_SEQ, the associated obi_array is opened or created + if ((data_type == 5) || (data_type == 6)) { array = obi_array(dms, array_name); if (array == NULL) @@ -838,8 +838,8 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms, column->writable = false; - // If the data type is OBI_IDX, the associated obi_array is opened or created - if ((column->header)->data_type == 5) + // If the data type is OBI_STR or OBI_SEQ, the associated obi_array is opened or created + if (((column->header)->data_type == 5) || ((column->header)->data_type == 6)) { array = obi_array(dms, (column->header)->array_name); if (array == NULL) @@ -1175,7 +1175,8 @@ void obi_ini_to_NA_values(OBIDMS_column_p column, } break; - case OBI_IDX: for (i=start;idata)) + i) = OBIIdx_NA; } diff --git a/src/obidmscolumn_seq.c b/src/obidmscolumn_seq.c new file mode 100644 index 0000000..d8e4d1d --- /dev/null +++ 
b/src/obidmscolumn_seq.c @@ -0,0 +1,120 @@ +/**************************************************************************** + * OBIDMS_column_seq functions * + ****************************************************************************/ + +/** + * @file obidsmcolumn_seq.c + * @author Celine Mercier + * @date November 18th 2015 + * @brief Functions handling OBIColumns containing data in the form of indices referring to DNA sequences. + */ + + +#include +#include + +#include "obidmscolumn.h" +#include "obitypes.h" +#include "obierrno.h" +#include "obidebug.h" +#include "obiarray.h" + + +#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + + +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + +int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value) +{ + byte_t* value_b; + index_t idx; + + // Check that the line number is not greater than the maximum allowed + if (line_nb >= MAXIMUM_LINE_COUNT) + { + obi_set_errno(OBICOL_UNKNOWN_ERROR); + obidebug(1, "\nError trying to set a value at a line number greater than the maximum allowed"); + return -1; + } + + // Check if the file needs to be enlarged + while ((line_nb+1) > (column->header)->line_count) + { + // Enlarge the file + if (obi_enlarge_column(column) < 0) + return -1; + } + + // Update lines used + if ((line_nb+1) > (column->header)->lines_used) + (column->header)->lines_used = line_nb+1; + + // Encode the value on a byte array with a header + value_b = obi_seq_to_obibytes(value); + if (value_b == NULL) + return -1; + + // Add in the obiarray + idx = obi_array_add(column->array, value_b); + if (idx == -1) + return -1; + + // Add the value's index in the column + *(((index_t*) (column->data)) + (line_nb * ((column->header)->nb_elements_per_line)) + element_idx) 
= idx; + + free(value_b); + + return 0; +} + + +const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx) +{ + index_t idx; + byte_t* value_b; + + if ((line_nb+1) > (column->header)->lines_used) + { + obi_set_errno(OBICOL_UNKNOWN_ERROR); + obidebug(1, "\nError trying to get a value that is beyond the current number of lines used"); + return "\0"; // TODO + } + + idx = *(((index_t*) (column->data)) + (line_nb * ((column->header)->nb_elements_per_line)) + element_idx); + + // Check NA + if (idx == OBIIdx_NA) + return "\0"; // TODO + + value_b = obi_array_get(column->array, idx); + return obi_obibytes_to_seq(value_b); +} + + +int obi_column_set_obiseq_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name, char* value) +{ + index_t element_idx; + element_idx = obi_column_get_element_index_from_name(column, element_name); + if (element_idx == OBIIdx_NA) + return -1; + if (obi_column_set_obiseq_with_elt_idx(column, line_nb, element_idx, value) < 0) + return -1; + return 0; +} + + +const char* obi_column_get_obiseq_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name) +{ + index_t element_idx; + + element_idx = obi_column_get_element_index_from_name(column, element_name); + if (element_idx == OBIIdx_NA) + return "\0"; + return obi_column_get_obiseq_with_elt_idx(column, line_nb, element_idx); +} + diff --git a/src/obidmscolumn_seq.h b/src/obidmscolumn_seq.h new file mode 100644 index 0000000..d35ec8e --- /dev/null +++ b/src/obidmscolumn_seq.h @@ -0,0 +1,101 @@ +/**************************************************************************** + * OBIDMS_column_seq header file * + ****************************************************************************/ + +/** + * @file obidmscolumn_seq.h + * @author Celine Mercier + * @date November 18th 2015 + * @brief Header file for the functions handling OBIColumns containing data in the form of indices referring to DNA sequences. 
+ */ + + +#ifndef OBIDMSCOLUMN_SEQ_H_ +#define OBIDMSCOLUMN_SEQ_H_ + + +#include +#include + +#include "obidmscolumn.h" +#include "obitypes.h" + + +/** + * @brief Sets a value in an OBIDMS column containing data in the form of indices referring + * to DNA sequences in an obiarray, using the index of the element in the line. + * + * @warning Pointers returned by obi_open_column() don't allow writing. + * + * @param column A pointer as returned by obi_create_column() or obi_clone_column(). + * @param line_nb The number of the line where the value should be set. + * @param element_idx The index of the element that should be set in the line. + * @param value The value that should be set. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value); + + +/** + * @brief Recovers a value in an OBIDMS column containing data in the form of indices referring + * to DNA sequences in an obiarray, using the index of the element in the line. + * + * @param column A pointer as returned by obi_create_column(). + * @param line_nb The number of the line where the value should be recovered. + * @param element_idx The index of the element that should be recovered in the line. + * + * @returns The recovered value. + * @retval '\0' the NA value of the type if an error occurred and obi_errno is set. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx); + + +/** + * @brief Sets a value in an OBIDMS column containing data in the form of indices referring + * to DNA sequences in an obiarray, using the name of the element in the line. 
+ * + * @warning Pointers returned by obi_open_column() don't allow writing. + * + * @param column A pointer as returned by obi_create_column() or obi_clone_column(). + * @param line_nb The number of the line where the value should be set. + * @param element_name The name of the element that should be set in the line. + * @param value The value that should be set. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_column_set_obiseq_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name, char* value); + + +/** + * @brief Recovers a value in an OBIDMS column containing data in the form of indices referring + * to DNA sequences in an obiarray, using the name of the element in the line. + * + * @param column A pointer as returned by obi_create_column() or obi_clone_column(). + * @param line_nb The number of the line where the value should be recovered. + * @param element_name The name of the element that should be recovered in the line. + * + * @returns The recovered value. + * @retval '\0' the NA value of the type if an error occurred and obi_errno is set. 
+ * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +const char* obi_column_get_obiseq_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name); + + +#endif /* OBIDMSCOLUMN_SEQ_H_ */ + diff --git a/src/obidmscolumn_str.h b/src/obidmscolumn_str.h index 8a2e195..b1c98de 100644 --- a/src/obidmscolumn_str.h +++ b/src/obidmscolumn_str.h @@ -97,5 +97,5 @@ int obi_column_set_obistr_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* obi_column_get_obistr_with_elt_name(OBIDMS_column_p column, index_t line_nb, const char* element_name); -#endif /* OBIDMSCOLUMN_IDX_H_ */ +#endif /* OBIDMSCOLUMN_STR_H_ */ diff --git a/src/obitypes.c b/src/obitypes.c index b5bf0b0..7cd9194 100644 --- a/src/obitypes.c +++ b/src/obitypes.c @@ -40,7 +40,10 @@ size_t obi_sizeof(OBIType_t type) case OBI_CHAR: size = sizeof(obichar_t); break; - case OBI_IDX: size = sizeof(index_t); + case OBI_STR: size = sizeof(index_t); + break; + + case OBI_SEQ: size = sizeof(index_t); break; default: size = 0; @@ -90,7 +93,10 @@ char* name_data_type(int data_type) case OBI_CHAR: name = strdup("OBI_CHAR"); break; - case OBI_IDX: name = strdup("OBI_IDX"); + case OBI_STR: name = strdup("OBI_STR"); + break; + + case OBI_SEQ: name = strdup("OBI_SEQ"); break; } diff --git a/src/obitypes.h b/src/obitypes.h index 9f99ec6..3811124 100644 --- a/src/obitypes.h +++ b/src/obitypes.h @@ -44,7 +44,8 @@ typedef enum OBIType { OBI_FLOAT, /**< a floating value (C type : double) */ OBI_BOOL, /**< a boolean true/false value, see obibool_t enum */ OBI_CHAR, /**< a character (C type : char) */ - OBI_IDX /**< an index in a data structure (C type : int64_t) */ + OBI_STR, /**< an index in a data structure (C type : int64_t) referring to a character string*/ + OBI_SEQ /**< an index in a data structure (C type : int64_t) referring to a DNA sequence*/ } OBIType_t, *OBIType_p; @@ -52,7 +53,7 @@ typedef int64_t index_t; typedef int32_t obiint_t; typedef double 
obifloat_t; typedef char obichar_t; - +// TODO same for obistr_t and obiseq_t ? /** * @brief Union used to compute the NA value of the OBI_FLOAT OBIType.