diff --git a/python/obitools3/obidms/_obidms.cfiles b/python/obitools3/obidms/_obidms.cfiles index bd77619..50d9879 100644 --- a/python/obitools3/obidms/_obidms.cfiles +++ b/python/obitools3/obidms/_obidms.cfiles @@ -2,6 +2,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/python/obitools3/obidms/_obidmscolumn_bool.cfiles b/python/obitools3/obidms/_obidmscolumn_bool.cfiles index 3b0e807..277bc9e 100644 --- a/python/obitools3/obidms/_obidmscolumn_bool.cfiles +++ b/python/obitools3/obidms/_obidmscolumn_bool.cfiles @@ -4,6 +4,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/python/obitools3/obidms/_obidmscolumn_char.cfiles b/python/obitools3/obidms/_obidmscolumn_char.cfiles index b3eb9a9..05d09a0 100644 --- a/python/obitools3/obidms/_obidmscolumn_char.cfiles +++ b/python/obitools3/obidms/_obidmscolumn_char.cfiles @@ -4,6 +4,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/python/obitools3/obidms/_obidmscolumn_float.cfiles b/python/obitools3/obidms/_obidmscolumn_float.cfiles index e74ace4..3619c44 100644 --- a/python/obitools3/obidms/_obidmscolumn_float.cfiles +++ b/python/obitools3/obidms/_obidmscolumn_float.cfiles @@ -4,6 +4,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/python/obitools3/obidms/_obidmscolumn_int.cfiles b/python/obitools3/obidms/_obidmscolumn_int.cfiles index 49cd1eb..9c0776f 100644 --- a/python/obitools3/obidms/_obidmscolumn_int.cfiles +++ b/python/obitools3/obidms/_obidmscolumn_int.cfiles @@ -4,6 +4,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/python/obitools3/obidms/_obidmscolumn_seq.cfiles b/python/obitools3/obidms/_obidmscolumn_seq.cfiles index 5eb2343..7ab8ac1 100644 --- a/python/obitools3/obidms/_obidmscolumn_seq.cfiles +++ b/python/obitools3/obidms/_obidmscolumn_seq.cfiles @@ -4,6 +4,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/python/obitools3/obidms/_obidmscolumn_str.cfiles b/python/obitools3/obidms/_obidmscolumn_str.cfiles index 52cda64..b987965 100644 --- a/python/obitools3/obidms/_obidmscolumn_str.cfiles +++ b/python/obitools3/obidms/_obidmscolumn_str.cfiles @@ -4,6 +4,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/python/obitools3/obidms/_obiseq.cfiles b/python/obitools3/obidms/_obiseq.cfiles index bd77619..50d9879 100644 --- a/python/obitools3/obidms/_obiseq.cfiles +++ b/python/obitools3/obidms/_obiseq.cfiles @@ -2,6 +2,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/python/obitools3/obidms/_obitaxo.cfiles b/python/obitools3/obidms/_obitaxo.cfiles index bd77619..50d9879 100644 --- a/python/obitools3/obidms/_obitaxo.cfiles +++ b/python/obitools3/obidms/_obitaxo.cfiles @@ -2,6 +2,8 @@ ../../../src/bloom.c ../../../src/crc64.h ../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c ../../../src/encode.h ../../../src/encode.c ../../../src/MurmurHash2.h diff --git a/src/dna_seq_indexer.c b/src/dna_seq_indexer.c new file mode 100644 index 0000000..26e4566 --- /dev/null +++ b/src/dna_seq_indexer.c @@ -0,0 +1,106 @@ +/**************************************************************************** + * DNA sequence indexing functions * + ****************************************************************************/ + +/** + * @file dna_seq_indexer.c + * @author Celine Mercier + * @date April 12th 2016 + * @brief Functions handling the indexing and retrieval of DNA sequences. + */ + + +#include +#include +#include + +#include "obiblob.h" +#include "obiblob_indexer.h" +#include "obidms.h" + + +#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + + +Obi_blob_p obi_seq_to_blob(const char* seq) +{ + Obi_blob_p value_b; + int32_t length_encoded_seq; // length of the encoded sequence in bytes + int32_t seq_length; + byte_t* encoded_seq; + + seq_length = strlen(seq); + + // Check if just ATGC and encode accordingly + if (only_ATGC(seq)) + { + // Compute the length (in bytes) of the encoded sequence + length_encoded_seq = ceil((double) seq_length / (double) 4.0); + // Encode + encoded_seq = encode_seq_on_2_bits(seq, seq_length); + if (encoded_seq == NULL) + return NULL; + value_b = obi_blob(encoded_seq, ELEMENT_SIZE_SEQ_2, length_encoded_seq, seq_length); + } + else + { + // Compute the length (in bytes) of the encoded sequence + length_encoded_seq = ceil((double) seq_length / (double) 2.0); + // Encode + encoded_seq = encode_seq_on_4_bits(seq, seq_length); + if (encoded_seq == NULL) + return NULL; + value_b = obi_blob(encoded_seq, ELEMENT_SIZE_SEQ_4, length_encoded_seq, seq_length); + } + + free(encoded_seq); + + return value_b; +} + + +char* obi_blob_to_seq(Obi_blob_p value_b) +{ + // Decode + if (value_b->element_size == 2) + return decode_seq_on_2_bits(value_b->value, value_b->length_decoded_value); + else + return decode_seq_on_4_bits(value_b->value, value_b->length_decoded_value); +} + + +index_t obi_index_dna_seq(Obi_indexer_p indexer, const char* value) +{ + Obi_blob_p value_b; + index_t idx; + + // Encode value + value_b = obi_seq_to_blob(value); + if (value_b == NULL) + return -1; + + // Add in the indexer + idx = obi_indexer_add(indexer, value_b); + + free(value_b); + + return idx; +} + + +char* obi_retrieve_dna_seq(Obi_indexer_p indexer, index_t idx) +{ + Obi_blob_p value_b; + char* seq; + + // Get encoded value + value_b = obi_indexer_get(indexer, idx); + + // Decode sequence + seq = obi_blob_to_seq(value_b); + + free(value_b); + + return seq; +} + diff --git a/src/dna_seq_indexer.h b/src/dna_seq_indexer.h new file mode 100644 index 0000000..641652c --- /dev/null +++ b/src/dna_seq_indexer.h @@ -0,0 +1,63 @@ +/**************************************************************************** + * DNA sequence indexer header file * + ****************************************************************************/ + +/** + * @file dna_seq_indexer.h + * @author Celine Mercier + * @date April 12th 2016 + * @brief Header file for the functions handling the indexing of DNA sequences. + */ + + +#ifndef DNA_SEQ_INDEXER_H_ +#define DNA_SEQ_INDEXER_H_ + + +#include +#include + +#include "obidms.h" +#include "obitypes.h" +#include "obiblob.h" +#include "obiblob_indexer.h" + + +/** + * @brief Converts a DNA sequence to a blob. + * + * @warning The blob must be freed by the caller. + * + * @param value The DNA sequence to convert. + * + * @returns A pointer to the blob created. + * @retval NULL if an error occurred. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +Obi_blob_p obi_seq_to_blob(const char* seq); + + +/** + * @brief Converts a blob to a DNA sequence. + * + * @param value_b The blob to convert. + * + * @returns A pointer to the DNA sequence contained in the blob. + * @retval NULL if an error occurred. + * + * @since November 2015 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +char* obi_blob_to_seq(Obi_blob_p value_b); + + +// TODO doc +index_t obi_index_dna_seq(Obi_indexer_p indexer, const char* value); + +char* obi_retrieve_dna_seq(Obi_indexer_p indexer, index_t idx); + + +#endif /* DNA_SEQ_INDEXER_H_ */ + diff --git a/src/encode.c b/src/encode.c index e2ff6a8..6faebc2 100644 --- a/src/encode.c +++ b/src/encode.c @@ -54,7 +54,7 @@ bool only_ATGC(const char* seq) } -byte_t* encode_seq_on_2_bits(char* seq, int32_t length) +byte_t* encode_seq_on_2_bits(const char* seq, int32_t length) { byte_t* seq_b; uint8_t modulo; @@ -163,7 +163,7 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq) } -byte_t* encode_seq_on_4_bits(char* seq, int32_t length) +byte_t* encode_seq_on_4_bits(const char* seq, int32_t length) { byte_t* seq_b; uint8_t modulo; diff --git a/src/encode.h b/src/encode.h index ca6c6ff..82c4573 100644 --- a/src/encode.h +++ b/src/encode.h @@ -96,7 +96,7 @@ bool only_ATGC(const char* seq); * @since November 2015 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -byte_t* encode_seq_on_2_bits(char* seq, int32_t length); +byte_t* encode_seq_on_2_bits(const char* seq, int32_t length); /** @@ -147,7 +147,7 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq); * @since November 2015 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -byte_t* encode_seq_on_4_bits(char* seq, int32_t length); +byte_t* encode_seq_on_4_bits(const char* seq, int32_t length); /** diff --git a/src/obiblob.c b/src/obiblob.c index 842ef1c..aa438ab 100644 --- a/src/obiblob.c +++ b/src/obiblob.c @@ -82,52 +82,5 @@ const char* obi_blob_to_str(Obi_blob_p value_b) } -Obi_blob_p obi_seq_to_blob(char* seq) -{ - Obi_blob_p value_b; - int32_t length_encoded_seq; // length of the encoded sequence in bytes - int32_t seq_length; - byte_t* encoded_seq; - - seq_length = strlen(seq); - - // Check if just ATGC and encode accordingly - if (only_ATGC(seq)) - { - // Compute the length (in bytes) of the encoded sequence - length_encoded_seq = ceil((double) seq_length / (double) 4.0); - // Encode - encoded_seq = encode_seq_on_2_bits(seq, seq_length); - if (encoded_seq == NULL) - return NULL; - value_b = obi_blob(encoded_seq, ELEMENT_SIZE_SEQ_2, length_encoded_seq, seq_length); - } - else - { - // Compute the length (in bytes) of the encoded sequence - length_encoded_seq = ceil((double) seq_length / (double) 2.0); - // Encode - encoded_seq = encode_seq_on_4_bits(seq, seq_length); - if (encoded_seq == NULL) - return NULL; - value_b = obi_blob(encoded_seq, ELEMENT_SIZE_SEQ_4, length_encoded_seq, seq_length); - } - - free(encoded_seq); - - return value_b; -} - - -const char* obi_blob_to_seq(Obi_blob_p value_b) -{ - // Decode - if (value_b->element_size == 2) - return decode_seq_on_2_bits(value_b->value, value_b->length_decoded_value); - else - return decode_seq_on_4_bits(value_b->value, value_b->length_decoded_value); -} - - // TODO same for int diff --git a/src/obiblob.h b/src/obiblob.h index b60e633..f39b9c8 100644 --- a/src/obiblob.h +++ b/src/obiblob.h @@ -45,6 +45,11 @@ typedef struct Obi_blob { } Obi_blob_t, *Obi_blob_p; + +// TODO doc +Obi_blob_p obi_blob(byte_t* encoded_value, uint8_t element_size, int32_t length_encoded_value, int32_t length_decoded_value); + + /** * @brief Converts a character string to a blob. * @@ -74,35 +79,5 @@ Obi_blob_p obi_str_to_blob(char* value); const char* obi_blob_to_str(Obi_blob_p value_b); -/** - * @brief Converts a DNA sequence to a blob with a header. - * - * @warning The blob must be freed by the caller. - * - * @param value The DNA sequence to convert. - * - * @returns A pointer to the blob created. - * @retval NULL if an error occurred. - * - * @since November 2015 - * @author Celine Mercier (celine.mercier@metabarcoding.org) - */ -Obi_blob_p obi_seq_to_blob(char* seq); - - -/** - * @brief Converts a blob to a DNA sequence. - * - * @param value_b The blob to convert. - * - * @returns A pointer to the DNA sequence contained in the blob. - * @retval NULL if an error occurred. - * - * @since November 2015 - * @author Celine Mercier (celine.mercier@metabarcoding.org) - */ -const char* obi_blob_to_seq(Obi_blob_p value_b); // TODO move to encode source files - - #endif /* OBIBLOB_H_ */ diff --git a/src/obidmscolumn_seq.c b/src/obidmscolumn_seq.c index f52dfe6..66caddc 100644 --- a/src/obidmscolumn_seq.c +++ b/src/obidmscolumn_seq.c @@ -18,7 +18,7 @@ #include "obitypes.h" #include "obierrno.h" #include "obidebug.h" -#include "obiblob_indexer.h" +#include "dna_seq_indexer.h" #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) @@ -33,7 +33,6 @@ int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx, char* value) { - Obi_blob_p value_b; index_t idx; // Check that the line number is not greater than the maximum allowed @@ -56,21 +55,13 @@ int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, if ((line_nb+1) > (column->header)->lines_used) (column->header)->lines_used = line_nb+1; - // Encode the value on a byte array with a header // TODO make function - value_b = obi_seq_to_blob(value); - if (value_b == NULL) - return -1; - - // Add in the indexer - idx = obi_indexer_add(column->indexer, value_b); + idx = obi_index_dna_seq(column->indexer, value); if (idx == -1) return -1; // Add the value's index in the column *(((index_t*) (column->data)) + (line_nb * ((column->header)->nb_elements_per_line)) + element_idx) = idx; - free(value_b); - return 0; } @@ -114,8 +105,7 @@ int obi_column_set_obiseq_with_elt_idx_in_view(Obiview_p view, OBIDMS_column_p c const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, index_t element_idx) { - index_t idx; - Obi_blob_p value_b; + index_t idx; if ((line_nb+1) > ((column->header)->line_count)) { @@ -130,9 +120,7 @@ const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t l if (idx == OBIIdx_NA) return OBISeq_NA; - value_b = obi_indexer_get(column->indexer, idx); - - return obi_blob_to_seq(value_b); + return obi_retrieve_dna_seq(column->indexer, idx); }