diff --git a/python/obitools3/obidms/_obidms.cfiles b/python/obitools3/obidms/_obidms.cfiles index 939bafa..1fcbfd6 100644 --- a/python/obitools3/obidms/_obidms.cfiles +++ b/python/obitools3/obidms/_obidms.cfiles @@ -22,3 +22,7 @@ ../../../src/obidmscolumn_idx.c ../../../src/obidms_taxonomy.c ../../../src/obidms_taxonomy.h +../../../src/bloom.c +../../../src/bloom.h +../../../src/MurmurHash2.c +../../../src/murmurhash2.h \ No newline at end of file diff --git a/src/MurmurHash2.c b/src/MurmurHash2.c new file mode 100755 index 0000000..32c4c32 --- /dev/null +++ b/src/MurmurHash2.c @@ -0,0 +1,64 @@ +//----------------------------------------------------------------------------- +// MurmurHash2, by Austin Appleby + +// Note - This code makes a few assumptions about how your machine behaves - + +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 + +// And it has a few limitations - + +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int murmurhash2(const void * key, int len, const unsigned int seed) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. 
+ + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} diff --git a/src/bloom.c b/src/bloom.c new file mode 100755 index 0000000..ab60aa6 --- /dev/null +++ b/src/bloom.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012-2015, Jyri J. Virkki + * All rights reserved. + * + * This file is under BSD license. See LICENSE file. + */ + +/* + * Refer to bloom.h for documentation on the public interfaces. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bloom.h" +#include "murmurhash2.h" + +#define MAKESTRING(n) STRING(n) +#define STRING(n) #n + +#ifdef __linux__ +unsigned detect_bucket_size(unsigned fallback_size); +#endif + + +static int test_bit_set_bit(unsigned char * buf, unsigned int x, int set_bit) +{ + register uint32_t * word_buf = (uint32_t *)buf; + register unsigned int offset = x >> 5; + register uint32_t word = word_buf[offset]; + register unsigned int mask = 1 << (x % 32); + + if (word & mask) { + return 1; + } else { + if (set_bit) { + word_buf[offset] = word | mask; + } + return 0; + } +} + + +static int bloom_check_add(struct bloom * bloom, + const void * buffer, int len, int add) +{ + if (bloom->ready == 0) { + (void)printf("bloom at %p not initialized!\n", (void *)bloom); + return -1; + } + + int hits = 0; + register unsigned int a = murmurhash2(buffer, len, 0x9747b28c); + register unsigned int b = murmurhash2(buffer, len, a); + register unsigned int x; + register unsigned int i; + + unsigned bucket_index = (a % bloom->buckets); + + unsigned char * bucket_ptr = + (bloom->bf + (bucket_index << bloom->bucket_bytes_exponent)); + + for (i = 0; i < bloom->hashes; i++) { + x = (a + i*b) & bloom->bucket_bits_fast_mod_operand; + if (test_bit_set_bit(bucket_ptr, x, add)) { + hits++; + } + } + + if (hits == bloom->hashes) { + return 1; // 1 == element already in (or collision) + } + + return 0; +} + + +static void setup_buckets(struct bloom * bloom, unsigned int cache_size) +{ + // If caller 
passed a non-zero cache_size, use it as given, otherwise + // either compute it or use built-in default + + if (cache_size == 0) { +#ifdef __linux__ + cache_size = detect_bucket_size(BLOOM_BUCKET_SIZE_FALLBACK); +#else + cache_size = BLOOM_BUCKET_SIZE_FALLBACK; +#endif + } + + bloom->buckets = (bloom->bytes / cache_size); + bloom->bucket_bytes = cache_size; + + // make sure bloom buffer bytes and bucket_bytes are even + int not_even_by = (bloom->bytes % bloom->bucket_bytes); + + if (not_even_by) { + // adjust bytes + bloom->bytes += (bloom->bucket_bytes - not_even_by); + assert((bloom->bytes % bloom->bucket_bytes) == 0); // Should get even + + // adjust bits + bloom->bits = bloom->bytes * 8; + + // adjust bits per element + bloom->bpe = bloom->bits*1. / bloom->entries; + + // adjust buckets + bloom->buckets++; + } + + bloom->bucket_bytes_exponent = __builtin_ctz(cache_size); + bloom->bucket_bits_fast_mod_operand = (cache_size * 8 - 1); +} + + +int bloom_init_size(struct bloom * bloom, int entries, double error, + unsigned int cache_size) +{ + bloom->ready = 0; + + if (entries < 1 || error == 0) { + return 1; + } + + bloom->entries = entries; + bloom->error = error; + + double num = log(bloom->error); + double denom = 0.480453013918201; // ln(2)^2 + bloom->bpe = -(num / denom); + + double dentries = (double)entries; + bloom->bits = (int)(dentries * bloom->bpe); + + if (bloom->bits % 8) { + bloom->bytes = (bloom->bits / 8) + 1; + } else { + bloom->bytes = bloom->bits / 8; + } + + bloom->hashes = (int)ceil(0.693147180559945 * bloom->bpe); // ln(2) + + setup_buckets(bloom, cache_size); + + bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char)); + if (bloom->bf == NULL) { + return 1; + } + + bloom->ready = 1; + return 0; +} + + +int bloom_init(struct bloom * bloom, int entries, double error) +{ + return bloom_init_size(bloom, entries, error, 0); +} + + +int bloom_check(struct bloom * bloom, const void * buffer, int len) +{ + return 
bloom_check_add(bloom, buffer, len, 0); +} + + +int bloom_add(struct bloom * bloom, const void * buffer, int len) +{ + return bloom_check_add(bloom, buffer, len, 1); +} + + +void bloom_print(struct bloom * bloom) +{ + (void)printf("bloom at %p\n", (void *)bloom); + (void)printf(" ->entries = %d\n", bloom->entries); + (void)printf(" ->error = %f\n", bloom->error); + (void)printf(" ->bits = %d\n", bloom->bits); + (void)printf(" ->bits per elem = %f\n", bloom->bpe); + (void)printf(" ->bytes = %d\n", bloom->bytes); + (void)printf(" ->buckets = %u\n", bloom->buckets); + (void)printf(" ->bucket_bytes = %u\n", bloom->bucket_bytes); + (void)printf(" ->bucket_bytes_exponent = %u\n", + bloom->bucket_bytes_exponent); + (void)printf(" ->bucket_bits_fast_mod_operand = 0%o\n", + bloom->bucket_bits_fast_mod_operand); + (void)printf(" ->hash functions = %d\n", bloom->hashes); +} + + +void bloom_free(struct bloom * bloom) +{ + if (bloom->ready) { + free(bloom->bf); + } + bloom->ready = 0; +} + + +const char * bloom_version() +{ + return MAKESTRING(BLOOM_VERSION); +} diff --git a/src/bloom.h b/src/bloom.h new file mode 100755 index 0000000..203584a --- /dev/null +++ b/src/bloom.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2012-2015, Jyri J. Virkki + * All rights reserved. + * + * This file is under BSD license. See LICENSE file. + */ + +#ifndef _BLOOM_H +#define _BLOOM_H + + +/** *************************************************************************** + * On Linux, the code attempts to compute a bucket size based on CPU cache + * size info, if available. If that fails for any reason, this fallback size + * is used instead. + * + * On non-Linux systems, this is the bucket size always used unless the + * caller overrides it (see bloom_init_size()). 
+ * + */ +#define BLOOM_BUCKET_SIZE_FALLBACK (32 * 1024) + + +/** *************************************************************************** + * It was found that using multiplier x0.5 for CPU L1 cache size is + * more effective in terms of CPU usage and, surprisingly, the number + * of collisions. + * + * Feel free to tune this constant the way it will work for you. + * + */ +#define BLOOM_L1_CACHE_SIZE_DIV 1 + + +/** *************************************************************************** + * Structure to keep track of one bloom filter. Caller needs to + * allocate this and pass it to the functions below. First call for + * every struct must be to bloom_init(). + * + */ +struct bloom +{ + // These fields are part of the public interface of this structure. + // Client code may read these values if desired. Client code MUST NOT + // modify any of these. + int entries; + double error; + int bits; + int bytes; + int hashes; + + // Fields below are private to the implementation. These may go away or + // change incompatibly at any moment. Client code MUST NOT access or rely + // on these. + unsigned buckets; + unsigned bucket_bytes; + + // x86 CPU divide by/multiply by operation optimization helpers + unsigned bucket_bytes_exponent; + unsigned bucket_bits_fast_mod_operand; + + double bpe; + unsigned char * bf; + int ready; +}; + + +/** *************************************************************************** + * Initialize the bloom filter for use. + * + * The filter is initialized with a bit field and number of hash functions + * according to the computations from the wikipedia entry: + * http://en.wikipedia.org/wiki/Bloom_filter + * + * Optimal number of bits is: + * bits = -(entries * ln(error)) / ln(2)^2 + * + * Optimal number of hash functions is: + * hashes = bpe * ln(2) (bpe = bits per entry, i.e. bits / entries) + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * entries - The expected number of entries which will be inserted.
+ * error - Probability of collision (as long as entries are not + * exceeded). + * + * Return: + * ------- + * 0 - on success + * 1 - on failure + * + */ +int bloom_init(struct bloom * bloom, int entries, double error); + + +/** *************************************************************************** + * Initialize the bloom filter for use. + * + * See comments above for general information. + * + * This is the same as bloom_init() but allows the caller to pass in a + * cache_size to override the internal value (which is either computed + * or the default of BLOOM_BUCKET_SIZE_FALLBACK). Mostly useful for + * experimenting. + * + * See misc/bucketsize for a script which can help identify a good value + * for cache_size. + * + */ +int bloom_init_size(struct bloom * bloom, int entries, double error, + unsigned int cache_size); + + +/** *************************************************************************** + * Check if the given element is in the bloom filter. Remember this may + * return a false positive if a collision occurred. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * buffer - Pointer to buffer containing element to check. + * len - Size of 'buffer'.
+ * + * Return: + * ------- + * 0 - element was not present and was added + * 1 - element (or a collision) had already been added previously + * -1 - bloom not initialized + * + */ +int bloom_add(struct bloom * bloom, const void * buffer, int len); + + +/** *************************************************************************** + * Print (to stdout) info about this bloom filter. Debugging aid. + * + */ +void bloom_print(struct bloom * bloom); + + +/** *************************************************************************** + * Deallocate internal storage. + * + * Upon return, the bloom struct is no longer usable. You may call bloom_init + * again on the same struct to reinitialize it again. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * + * Return: none + * + */ +void bloom_free(struct bloom * bloom); + + +/** *************************************************************************** + * Returns version string compiled into library. + * + * Return: version string + * + */ +const char * bloom_version(); + + +#endif diff --git a/src/murmurhash2.h b/src/murmurhash2.h new file mode 100755 index 0000000..e607381 --- /dev/null +++ b/src/murmurhash2.h @@ -0,0 +1,7 @@ + +#ifndef _BLOOM_MURMURHASH2 +#define _BLOOM_MURMURHASH2 + +unsigned int murmurhash2(const void * key, int len, const unsigned int seed); + +#endif diff --git a/src/obiavl.c b/src/obiavl.c index 3eb7450..000ec7d 100644 --- a/src/obiavl.c +++ b/src/obiavl.c @@ -19,6 +19,9 @@ #include #include +//#include +#include "bloom.h" + #include "obiavl.h" #include "obierrno.h" #include "obitypes.h" @@ -30,6 +33,7 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) 
+ /************************************************************************** * * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S @@ -481,21 +485,23 @@ int grow_avl(OBIDMS_avl_p avl) // TODO Lock when needed int avl_file_descriptor; char* avl_file_name; - // Get the avl file name - avl_file_name = build_avl_file_name((avl->header)->avl_name); - if (avl_file_name == NULL) - return -1; + avl_file_descriptor = avl->avl_fd; - // Open the avl file - avl_file_descriptor = openat(avl->dir_fd, avl_file_name, O_RDWR); - if (avl_file_descriptor < 0) - { - obi_set_errno(OBI_AVL_ERROR); - obidebug(1, "\nError opening an AVL tree file"); - free(avl_file_name); - return -1; - } - free(avl_file_name); +// // Get the avl file name +// avl_file_name = build_avl_file_name((avl->header)->avl_name); +// if (avl_file_name == NULL) +// return -1; +// +// // Open the avl file +// avl_file_descriptor = openat(avl->dir_fd, avl_file_name, O_RDWR); +// if (avl_file_descriptor < 0) +// { +// obi_set_errno(OBI_AVL_ERROR); +// obidebug(1, "\nError opening an AVL tree file"); +// free(avl_file_name); +// return -1; +// } +// free(avl_file_name); // Calculate the new file size old_data_size = (avl->header)->avl_size; @@ -544,7 +550,7 @@ int grow_avl(OBIDMS_avl_p avl) // TODO Lock when needed // Set the new avl size (avl->header)->avl_size = new_data_size; - close(avl_file_descriptor); + //close(avl_file_descriptor); return 0; } @@ -559,21 +565,23 @@ int grow_avl_data(OBIDMS_avl_p avl) // TODO Lock when needed int avl_data_file_descriptor; char* avl_data_file_name; - // Get the avl data file name - avl_data_file_name = build_avl_data_file_name((avl->header)->avl_name); - if (avl_data_file_name == NULL) - return -1; + avl_data_file_descriptor = avl->data_fd; - // Open the avl data file - avl_data_file_descriptor = openat(avl->dir_fd, avl_data_file_name, O_RDWR); - if (avl_data_file_descriptor < 0) - { - obi_set_errno(OBI_AVL_ERROR); - obidebug(1, "\nError opening an AVL tree data 
file"); - free(avl_data_file_name); - return -1; - } - free(avl_data_file_name); +// // Get the avl data file name +// avl_data_file_name = build_avl_data_file_name((avl->header)->avl_name); +// if (avl_data_file_name == NULL) +// return -1; +// +// // Open the avl data file +// avl_data_file_descriptor = openat(avl->dir_fd, avl_data_file_name, O_RDWR); +// if (avl_data_file_descriptor < 0) +// { +// obi_set_errno(OBI_AVL_ERROR); +// obidebug(1, "\nError opening an AVL tree data file"); +// free(avl_data_file_name); +// return -1; +// } +// free(avl_data_file_name); // Calculate the new file size old_data_size = ((avl->data)->header)->data_size_max; @@ -619,10 +627,12 @@ int grow_avl_data(OBIDMS_avl_p avl) // TODO Lock when needed // Set new data size ((avl->data)->header)->data_size_max = new_data_size; + //fprintf(stderr, "\nGrowing AVL, new data size = %lld, count = %ld\n", new_data_size, (avl->header)->nb_items); + // Initialize new data to 0 memset(((avl->data)->data)+old_data_size, 0, new_data_size - old_data_size); - close(avl_data_file_descriptor); + //close(avl_data_file_descriptor); return 0; } @@ -996,6 +1006,74 @@ OBIDMS_avl_p obi_avl(OBIDMS_p dms, const char* avl_name) } +OBIDMS_avl_group_p obi_create_avl_group(OBIDMS_p dms, const char* avl_name) +{ + OBIDMS_avl_group_p avl_group; + char* avl_name_with_idx; + + avl_group = (OBIDMS_avl_group_p) malloc(sizeof(OBIDMS_avl_group_t)); + + // Create 1st avl + asprintf(&avl_name_with_idx,"%s_%u", avl_name, 0); + (avl_group->sub_avls)[0] = obi_create_avl(dms, avl_name_with_idx); + avl_group->current_avl_idx = 0; + strcpy(avl_group->avl_name, avl_name); + + avl_group->dms = dms; + + return avl_group; +} + + +int unmap_an_avl(OBIDMS_avl_p avl) +{ + if (munmap((avl->data)->data, ((avl->data)->header)->data_size_max) < 0) + return -1; + if (munmap(avl->tree, (((avl->header)->nb_items_max) * sizeof(AVL_node_t))) < 0) + return -1; + return 0; +} + + +int remap_an_avl(OBIDMS_avl_p avl) +{ + (avl->data)->data = 
mmap(NULL, + ((avl->data)->header)->data_size_max, + PROT_READ | PROT_WRITE, + MAP_SHARED, + avl->data_fd, + ((avl->data)->header)->header_size); + if ((avl->data)->data == MAP_FAILED) /* mmap() reports failure with MAP_FAILED, never NULL */ + return -1; + + avl->tree = mmap(NULL, + ((avl->header)->nb_items_max) * sizeof(AVL_node_t), + PROT_READ | PROT_WRITE, + MAP_SHARED, + avl->avl_fd, + (avl->header)->header_size); + if (avl->tree == MAP_FAILED) /* same failure convention as the data mapping above */ + return -1; + + return 0; +} + + +int obi_add_new_avl_in_group(OBIDMS_avl_group_p avl_group) // TODO check for errors +{ + char* avl_name_with_idx; + + // unmap older + unmap_an_avl((avl_group->sub_avls)[avl_group->current_avl_idx]); + + (avl_group->current_avl_idx)++; + asprintf(&avl_name_with_idx,"%s_%u", avl_group->avl_name, avl_group->current_avl_idx); + (avl_group->sub_avls)[avl_group->current_avl_idx] = obi_create_avl(avl_group->dms, avl_name_with_idx); + + return 0; +} + + OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name) { char* avl_file_name; @@ -1098,7 +1176,7 @@ OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name) // Initialize all bits to 0 memset(avl_data->data, 0, (avl_data->header)->data_size_max); - close(avl_data_file_descriptor); + //close(avl_data_file_descriptor); // Create the AVL tree file @@ -1198,7 +1276,13 @@ OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name) (avl->header)->creation_date = time(NULL); strcpy((avl->header)->avl_name, avl_name); - close(avl_file_descriptor); + avl->avl_fd = avl_file_descriptor; + avl->data_fd = avl_data_file_descriptor; + + // Bloom filter + bloom_init(&((avl->header)->bloom_filter), 2000000, 0.001); // TODO use macros + + //close(avl_file_descriptor); // Add in the list of opened AVL trees *(((dms->opened_avls)->avls)+((dms->opened_avls)->nb_opened_avls)) = avl; @@ -1305,7 +1389,7 @@ OBIDMS_avl_p obi_open_avl(OBIDMS_p dms, const char* avl_name) return NULL; } - close(avl_data_file_descriptor); + //close(avl_data_file_descriptor); // Open the AVL tree file @@ -1391,7 +1475,10 @@ OBIDMS_avl_p
obi_open_avl(OBIDMS_p dms, const char* avl_name) avl->directory = dms->avl_directory; avl->dir_fd = avl_dir_file_descriptor; - close(avl_file_descriptor); + avl->avl_fd = avl_file_descriptor; + avl->data_fd = avl_data_file_descriptor; + + //close(avl_file_descriptor); // Add in the list of opened AVL trees *(((dms->opened_avls)->avls)+((dms->opened_avls)->nb_opened_avls)) = avl; @@ -1456,6 +1543,53 @@ byte_t* obi_avl_get(OBIDMS_avl_p avl, index_t idx) } +int maybe_in_avl(OBIDMS_avl_p avl, byte_t* value) +{ + return (bloom_check(&((avl->header)->bloom_filter), value, (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1))))); +} + + +index_t insert_in_avl_group(OBIDMS_avl_group_p avl_group, byte_t* value) // TODO won't be index_t +{ + index_t index_if_already_in; + int i; + + if (maybe_in_avl((avl_group->sub_avls)[avl_group->current_avl_idx], value)) + { + //fprintf(stderr, "\nyah maybe"); + index_if_already_in = obi_avl_find((avl_group->sub_avls)[avl_group->current_avl_idx], value); + if (index_if_already_in >= 0) + return index_if_already_in; + } +// else +// fprintf(stderr, "\nnah"); + for (i=0; i < (avl_group->current_avl_idx); i++) + { + if (maybe_in_avl((avl_group->sub_avls)[i], value)) + { + //fprintf(stderr, "\nyah maybe"); + if (remap_an_avl((avl_group->sub_avls)[i]) < 0) + return -1; + index_if_already_in = obi_avl_find((avl_group->sub_avls)[i], value); + if (unmap_an_avl((avl_group->sub_avls)[i]) < 0) + return -1; + if (index_if_already_in >= 0) + return index_if_already_in; + } +// else +// fprintf(stderr, "\nnah"); + } + + // not found in any avl: add in current + // first, check if make new one + if ((((avl_group->sub_avls)[avl_group->current_avl_idx])->header)->nb_items == 2000000) // TODO add condition with data size + use macro + obi_add_new_avl_in_group(avl_group); + + bloom_add(&((((avl_group->sub_avls)[avl_group->current_avl_idx])->header)->bloom_filter), value, (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1)))); + return 
obi_avl_add((avl_group->sub_avls)[avl_group->current_avl_idx], value); +} + + // Insert a new node index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value) { @@ -1519,7 +1653,10 @@ index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value) next = current_node->right_child; else if (comp == 0) // Value already stored - return current_node->value; + { + //fprintf(stderr, "\n>>>ALREADY IN, %s, %lld\n", obi_obibytes_to_seq(value), (avl->header)->nb_items); + return current_node->value; // TODO should trigger error if using bloom filters + } depth++; } @@ -1576,7 +1713,7 @@ index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value) } -// Find if a value is already in an AVL tree +// Find if a value is already in an AVL tree TODO use bloom index_t obi_avl_find(OBIDMS_avl_p avl, byte_t* value) { int comp; @@ -1632,7 +1769,7 @@ byte_t* obi_str_to_obibytes(char* value) *((int32_t*)(value_b+1)) = length; // Store the initial length (in bytes) of the decoded value (same as encoded for character strings) - *((int32_t*)(value_b+5)) = length; + *((int64_t*)(value_b+5)) = length; // Store the character string strcpy(value_b+BYTE_ARRAY_HEADER_SIZE, value); diff --git a/src/obiavl.h b/src/obiavl.h index bc4e918..1850455 100644 --- a/src/obiavl.h +++ b/src/obiavl.h @@ -25,6 +25,10 @@ #include "obidms.h" #include "obitypes.h" +#include "bloom.h" + +#define NB_OF_AVLS (64) +#define MASK (63) #define AVL_MAX_NAME (1024) /**< The maximum length of an AVL tree name. */ @@ -39,6 +43,8 @@ #define BYTE_ARRAY_HEADER_SIZE (9) /**< The size of the header of a byte array. */ +typedef struct bloom bloom_t; + /** * @brief AVL tree node structure. @@ -48,7 +54,7 @@ typedef struct AVL_node { */ index_t right_child; /**< Index of right greater child node. */ - int8_t balance_factor; /**< Balance factor of the node. + int8_t balance_factor; /**< Balance factor of the node. */ index_t value; /**< Index of the value associated with the node in the data array. 
*/ @@ -103,6 +109,7 @@ typedef struct OBIDMS_avl_header { */ time_t creation_date; /**< Date of creation of the file. */ + bloom_t bloom_filter; } OBIDMS_avl_header_t, *OBIDMS_avl_header_p; @@ -132,9 +139,28 @@ typedef struct OBIDMS_avl { */ size_t counter; /**< Indicates by how many threads/programs (TODO) the AVL tree is used. */ + int avl_fd; + int data_fd; } OBIDMS_avl_t, *OBIDMS_avl_p; +/** + * @brief OBIDMS AVL tree group structure. + */ +typedef struct OBIDMS_avl_group { + // TODO put each group in a directory later + OBIDMS_avl_p sub_avls[64]; // TODO macro for max + int current_avl_idx; + char avl_name[AVL_MAX_NAME+1]; + OBIDMS_p dms; +} OBIDMS_avl_group_t, *OBIDMS_avl_group_p; + + +OBIDMS_avl_group_p obi_create_avl_group(OBIDMS_p dms, const char* avl_name); +index_t insert_in_avl_group(OBIDMS_avl_group_p avl_group, byte_t* value); + + + /** * @brief Checks if an AVL tree already exists or not. * diff --git a/src/obidmscolumn.c b/src/obidmscolumn.c index 4eaf0f3..13a5169 100644 --- a/src/obidmscolumn.c +++ b/src/obidmscolumn.c @@ -525,7 +525,6 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, OBIDMS_column_p new_column; OBIDMS_column_directory_p column_directory; OBIDMS_column_header_p header; - OBIDMS_avl_p avl; size_t file_size; obiversion_t version_number; char* column_file_name; @@ -727,16 +726,15 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, // If the data type is OBI_STR or OBI_SEQ, the associated obi_avl is opened or created if ((returned_data_type == OBI_STR) || (returned_data_type == OBI_SEQ)) { - avl = obi_avl(dms, avl_name); - if (avl == NULL) - { - obidebug(1, "\nError opening or creating the aVL tree associated with a column"); - munmap(new_column->header, header_size); - close(column_file_descriptor); - free(new_column); - return NULL; - } - new_column->avl = avl; + new_column->avl = obi_create_avl_group(dms, avl_name); +// if (avl == NULL) TODO +// { +// obidebug(1, "\nError opening or creating the aVL tree associated with a column"); 
+// munmap(new_column->header, header_size); +// close(column_file_descriptor); +// free(new_column); +// return NULL; +// } strncpy(header->avl_name, avl_name, AVL_MAX_NAME); } @@ -760,11 +758,11 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms, { OBIDMS_column_p column; OBIDMS_column_directory_p column_directory; - OBIDMS_avl_p avl; char* column_file_name; int column_file_descriptor; size_t header_size; size_t i; + OBIDMS_avl_p avl; column = NULL; @@ -890,7 +888,7 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms, free(column); return NULL; } - column->avl = avl; + //column->avl = avl; TODO } close(column_file_descriptor); @@ -1024,12 +1022,12 @@ int obi_close_column(OBIDMS_column_p column) } } - // If the data type is OBI_STR or OBI_SEQ, the associated AVL tree is closed - if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ)) - { - if (obi_close_avl(column->avl) < 0) - return -1; - } + // If the data type is OBI_STR or OBI_SEQ, the associated AVL tree is closed TODO +// if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ)) +// { +// if (obi_close_avl(column->avl) < 0) +// return -1; +// } // Munmap data if (munmap(column->data, (column->header)->data_size) < 0) diff --git a/src/obidmscolumn.h b/src/obidmscolumn.h index c5d355b..39d9e04 100644 --- a/src/obidmscolumn.h +++ b/src/obidmscolumn.h @@ -98,7 +98,7 @@ typedef struct OBIDMS_column { */ OBIDMS_column_header_p header; /**< A pointer to the header of the column. */ - OBIDMS_avl_p avl; /**< A pointer to the AVL tree associated with the column if there is one. + OBIDMS_avl_group_p avl; /**< TODO A pointer to the group of AVL trees associated with the column if there is one. */ void* data; /**< A `void` pointer to the beginning of the data. 
* diff --git a/src/obidmscolumn_seq.c b/src/obidmscolumn_seq.c index 789b859..ffc2abe 100644 --- a/src/obidmscolumn_seq.c +++ b/src/obidmscolumn_seq.c @@ -61,8 +61,13 @@ int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, if (value_b == NULL) return -1; + //if (strlen(value_b) == 0) + // fprintf(stderr, "\nPOUIC"); + + //fprintf(stderr, "\n>%s||%s", value, obi_obibytes_to_seq(value_b)); + // Add in the AVL tree - idx = obi_avl_add(column->avl, value_b); + idx = insert_in_avl_group(column->avl, value_b); if (idx == -1) return -1; @@ -130,7 +135,7 @@ const char* obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column, index_t l if (idx == OBIIdx_NA) return OBISeq_NA; - value_b = obi_avl_get(column->avl, idx); + //value_b = obi_avl_get((column->avl)[crc(value)], idx); return obi_obibytes_to_seq(value_b); } diff --git a/src/obidmscolumn_str.c b/src/obidmscolumn_str.c index c92ca1d..5a6fdad 100644 --- a/src/obidmscolumn_str.c +++ b/src/obidmscolumn_str.c @@ -61,7 +61,7 @@ int obi_column_set_obistr_with_elt_idx(OBIDMS_column_p column, index_t line_nb, return -1; // Add in the AVL tree - idx = obi_avl_add(column->avl, value_b); + idx = insert_in_avl_group(column->avl, value_b); if (idx == -1) return -1; @@ -129,7 +129,7 @@ const char* obi_column_get_obistr_with_elt_idx(OBIDMS_column_p column, index_t l if (idx == OBIIdx_NA) return OBIStr_NA; - value_b = obi_avl_get(column->avl, idx); + //value_b = obi_avl_get(column->avl, idx); return obi_obibytes_to_str(value_b); } diff --git a/src/obiview.c b/src/obiview.c index 73da4be..953b90a 100644 --- a/src/obiview.c +++ b/src/obiview.c @@ -228,6 +228,7 @@ Obiview_p obi_new_view_nuc_seqs(OBIDMS_p dms, const char* view_name, Obiview_p v if (view== NULL) return NULL; + fprintf(stderr, "\nmmmm\n"); strcpy(view->view_type, VIEW_TYPE_NUC_SEQS); if (view_to_clone == NULL)