diff --git a/python/obitools3/obidms/_obidms.cfiles b/python/obitools3/obidms/_obidms.cfiles index 939bafa..1fcbfd6 100644 --- a/python/obitools3/obidms/_obidms.cfiles +++ b/python/obitools3/obidms/_obidms.cfiles @@ -22,3 +22,7 @@ ../../../src/obidmscolumn_idx.c ../../../src/obidms_taxonomy.c ../../../src/obidms_taxonomy.h +../../../src/bloom.c +../../../src/bloom.h +../../../src/MurmurHash2.c +../../../src/murmurhash2.h \ No newline at end of file diff --git a/src/MurmurHash2.c b/src/MurmurHash2.c new file mode 100755 index 0000000..32c4c32 --- /dev/null +++ b/src/MurmurHash2.c @@ -0,0 +1,64 @@ +//----------------------------------------------------------------------------- +// MurmurHash2, by Austin Appleby + +// Note - This code makes a few assumptions about how your machine behaves - + +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 + +// And it has a few limitations - + +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int murmurhash2(const void * key, int len, const unsigned int seed) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. 
+ + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} diff --git a/src/bloom.c b/src/bloom.c new file mode 100755 index 0000000..ab60aa6 --- /dev/null +++ b/src/bloom.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012-2015, Jyri J. Virkki + * All rights reserved. + * + * This file is under BSD license. See LICENSE file. + */ + +/* + * Refer to bloom.h for documentation on the public interfaces. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bloom.h" +#include "murmurhash2.h" + +#define MAKESTRING(n) STRING(n) +#define STRING(n) #n + +#ifdef __linux__ +unsigned detect_bucket_size(unsigned fallback_size); +#endif + + +static int test_bit_set_bit(unsigned char * buf, unsigned int x, int set_bit) +{ + register uint32_t * word_buf = (uint32_t *)buf; + register unsigned int offset = x >> 5; + register uint32_t word = word_buf[offset]; + register unsigned int mask = 1 << (x % 32); + + if (word & mask) { + return 1; + } else { + if (set_bit) { + word_buf[offset] = word | mask; + } + return 0; + } +} + + +static int bloom_check_add(struct bloom * bloom, + const void * buffer, int len, int add) +{ + if (bloom->ready == 0) { + (void)printf("bloom at %p not initialized!\n", (void *)bloom); + return -1; + } + + int hits = 0; + register unsigned int a = murmurhash2(buffer, len, 0x9747b28c); + register unsigned int b = murmurhash2(buffer, len, a); + register unsigned int x; + register unsigned int i; + + unsigned bucket_index = (a % bloom->buckets); + + unsigned char * bucket_ptr = + (bloom->bf + (bucket_index << bloom->bucket_bytes_exponent)); + + for (i = 0; i < bloom->hashes; i++) { + x = (a + i*b) & bloom->bucket_bits_fast_mod_operand; + if (test_bit_set_bit(bucket_ptr, x, add)) { + hits++; + } + } + + if (hits == bloom->hashes) { + return 1; // 1 == element already in (or collision) + } + + return 0; +} + + +static void setup_buckets(struct bloom * bloom, unsigned int cache_size) +{ + // If caller 
passed a non-zero cache_size, use it as given, otherwise + // either compute it or use built-in default + + if (cache_size == 0) { +#ifdef __linux__ + cache_size = detect_bucket_size(BLOOM_BUCKET_SIZE_FALLBACK); +#else + cache_size = BLOOM_BUCKET_SIZE_FALLBACK; +#endif + } + + bloom->buckets = (bloom->bytes / cache_size); + bloom->bucket_bytes = cache_size; + + // make sure bloom buffer bytes and bucket_bytes are even + int not_even_by = (bloom->bytes % bloom->bucket_bytes); + + if (not_even_by) { + // adjust bytes + bloom->bytes += (bloom->bucket_bytes - not_even_by); + assert((bloom->bytes % bloom->bucket_bytes) == 0); // Should get even + + // adjust bits + bloom->bits = bloom->bytes * 8; + + // adjust bits per element + bloom->bpe = bloom->bits*1. / bloom->entries; + + // adjust buckets + bloom->buckets++; + } + + bloom->bucket_bytes_exponent = __builtin_ctz(cache_size); + bloom->bucket_bits_fast_mod_operand = (cache_size * 8 - 1); +} + + +int bloom_init_size(struct bloom * bloom, int entries, double error, + unsigned int cache_size) +{ + bloom->ready = 0; + + if (entries < 1 || error == 0) { + return 1; + } + + bloom->entries = entries; + bloom->error = error; + + double num = log(bloom->error); + double denom = 0.480453013918201; // ln(2)^2 + bloom->bpe = -(num / denom); + + double dentries = (double)entries; + bloom->bits = (int)(dentries * bloom->bpe); + + if (bloom->bits % 8) { + bloom->bytes = (bloom->bits / 8) + 1; + } else { + bloom->bytes = bloom->bits / 8; + } + + bloom->hashes = (int)ceil(0.693147180559945 * bloom->bpe); // ln(2) + + setup_buckets(bloom, cache_size); + + bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char)); + if (bloom->bf == NULL) { + return 1; + } + + bloom->ready = 1; + return 0; +} + + +int bloom_init(struct bloom * bloom, int entries, double error) +{ + return bloom_init_size(bloom, entries, error, 0); +} + + +int bloom_check(struct bloom * bloom, const void * buffer, int len) +{ + return 
bloom_check_add(bloom, buffer, len, 0); +} + + +int bloom_add(struct bloom * bloom, const void * buffer, int len) +{ + return bloom_check_add(bloom, buffer, len, 1); +} + + +void bloom_print(struct bloom * bloom) +{ + (void)printf("bloom at %p\n", (void *)bloom); + (void)printf(" ->entries = %d\n", bloom->entries); + (void)printf(" ->error = %f\n", bloom->error); + (void)printf(" ->bits = %d\n", bloom->bits); + (void)printf(" ->bits per elem = %f\n", bloom->bpe); + (void)printf(" ->bytes = %d\n", bloom->bytes); + (void)printf(" ->buckets = %u\n", bloom->buckets); + (void)printf(" ->bucket_bytes = %u\n", bloom->bucket_bytes); + (void)printf(" ->bucket_bytes_exponent = %u\n", + bloom->bucket_bytes_exponent); + (void)printf(" ->bucket_bits_fast_mod_operand = 0%o\n", + bloom->bucket_bits_fast_mod_operand); + (void)printf(" ->hash functions = %d\n", bloom->hashes); +} + + +void bloom_free(struct bloom * bloom) +{ + if (bloom->ready) { + free(bloom->bf); + } + bloom->ready = 0; +} + + +const char * bloom_version() +{ + return MAKESTRING(BLOOM_VERSION); +} diff --git a/src/bloom.h b/src/bloom.h new file mode 100755 index 0000000..203584a --- /dev/null +++ b/src/bloom.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2012-2015, Jyri J. Virkki + * All rights reserved. + * + * This file is under BSD license. See LICENSE file. + */ + +#ifndef _BLOOM_H +#define _BLOOM_H + + +/** *************************************************************************** + * On Linux, the code attempts to compute a bucket size based on CPU cache + * size info, if available. If that fails for any reason, this fallback size + * is used instead. + * + * On non-Linux systems, this is the bucket size always used unless the + * caller overrides it (see bloom_init_size()). 
+ * + */ +#define BLOOM_BUCKET_SIZE_FALLBACK (32 * 1024) + + +/** *************************************************************************** + * It was found that using multiplier x0.5 for CPU L1 cache size is + * more effective in terms of CPU usage and, surprisingly, collisions + * number. + * + * Feel free to tune this constant the way it will work for you. + * + */ +#define BLOOM_L1_CACHE_SIZE_DIV 1 + + +/** *************************************************************************** + * Structure to keep track of one bloom filter. Caller needs to + * allocate this and pass it to the functions below. First call for + * every struct must be to bloom_init(). + * + */ +struct bloom +{ + // These fields are part of the public interface of this structure. + // Client code may read these values if desired. Client code MUST NOT + // modify any of these. + int entries; + double error; + int bits; + int bytes; + int hashes; + + // Fields below are private to the implementation. These may go away or + // change incompatibly at any moment. Client code MUST NOT access or rely + // on these. + unsigned buckets; + unsigned bucket_bytes; + + // x86 CPU divide by/multiply by operation optimization helpers + unsigned bucket_bytes_exponent; + unsigned bucket_bits_fast_mod_operand; + + double bpe; + unsigned char * bf; + int ready; +}; + + +/** *************************************************************************** + * Initialize the bloom filter for use. + * + * The filter is initialized with a bit field and number of hash functions + * according to the computations from the wikipedia entry: + * http://en.wikipedia.org/wiki/Bloom_filter + * + * Optimal number of bits is: + * bits = (entries * ln(error)) / ln(2)^2 + * + * Optimal number of hash functions is: + * hashes = bpe * ln(2) + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * entries - The expected number of entries which will be inserted. 
+ * error - Probability of collision (as long as entries are not + * exceeded). + * + * Return: + * ------- + * 0 - on success + * 1 - on failure + * + */ +int bloom_init(struct bloom * bloom, int entries, double error); + + +/** *************************************************************************** + * Initialize the bloom filter for use. + * + * See comments above for general information. + * + * This is the same as bloom_init() but allows the caller to pass in a + * cache_size to override the internal value (which is either computed + * or the default of BLOOM_BUCKET_SIZE_FALLBACK). Mostly useful for + * experimenting. + * + * See misc/bucketsize for a script which can help identify a good value + * for cache_size. Returns 0 on success and 1 on failure, like bloom_init(). + * + */ +int bloom_init_size(struct bloom * bloom, int entries, double error, + unsigned int cache_size); + + +/** *************************************************************************** + * Check if the given element is in the bloom filter. Remember this may + * return a false positive if a collision occurred. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * buffer - Pointer to buffer containing element to check. + * len - Size of 'buffer'. + * + * Return: + * ------- + * 0 - element is not present + * 1 - element is present (or false positive due to collision) + * -1 - bloom not initialized + * + */ +int bloom_check(struct bloom * bloom, const void * buffer, int len); + + +/** *************************************************************************** + * Add the given element to the bloom filter. + * The return code indicates if the element (or a collision) was already in, + * so for the common check+add use case, no need to call check separately. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * buffer - Pointer to buffer containing element to add. + * len - Size of 'buffer'. 
+ * + * Return: + * ------- + * 0 - element was not present and was added + * 1 - element (or a collision) had already been added previously + * -1 - bloom not initialized + * + */ +int bloom_add(struct bloom * bloom, const void * buffer, int len); + + +/** *************************************************************************** + * Print (to stdout) info about this bloom filter. Debugging aid. + * + */ +void bloom_print(struct bloom * bloom); + + +/** *************************************************************************** + * Deallocate internal storage. + * + * Upon return, the bloom struct is no longer usable. You may call bloom_init + * again on the same struct to reinitialize it again. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * + * Return: none + * + */ +void bloom_free(struct bloom * bloom); + + +/** *************************************************************************** + * Returns version string compiled into library. + * + * Return: version string + * + */ +const char * bloom_version(); + + +#endif diff --git a/src/murmurhash2.h b/src/murmurhash2.h new file mode 100755 index 0000000..e607381 --- /dev/null +++ b/src/murmurhash2.h @@ -0,0 +1,7 @@ + +#ifndef _BLOOM_MURMURHASH2 +#define _BLOOM_MURMURHASH2 + +unsigned int murmurhash2(const void * key, int len, const unsigned int seed); + +#endif diff --git a/src/obiavl.c b/src/obiavl.c index 50022cf..41e66fe 100644 --- a/src/obiavl.c +++ b/src/obiavl.c @@ -19,6 +19,9 @@ #include #include +//#include +#include "bloom.h" + #include "obiavl.h" #include "obierrno.h" #include "obitypes.h" @@ -30,158 +33,6 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) 
-////crc crcTable[256]; -//static crc crcTable[] = { -//0x00, 0xd8, 0x68, 0xb0, 0xd0, 0x8, 0xb8, 0x60, 0x78, 0xa0, 0x10, 0xc8, 0xa8, 0x70, 0xc0, 0x18, 0xf0, 0x28, 0x98, 0x40, 0x20, 0xf8, 0x48, 0x90, 0x88, 0x50, 0xe0, 0x38, 0x58, 0x80, 0x30, 0xe8, 0x38, 0xe0, 0x50, 0x88, 0xe8, 0x30, 0x80, 0x58, 0x40, 0x98, 0x28, 0xf0, 0x90, 0x48, 0xf8, 0x20, 0xc8, 0x10, 0xa0, 0x78, 0x18, 0xc0, 0x70, 0xa8, 0xb0, 0x68, 0xd8, 0, 0x60, 0xb8, 0x8, 0xd0, 0x70, 0xa8, 0x18, 0xc0, 0xa0, 0x78, 0xc8, 0x10, 0x8, 0xd0, 0x60, 0xb8, 0xd8, 0, 0xb0, 0x68, 0x80, 0x58, 0xe8, 0x30, 0x50, 0x88, 0x38, 0xe0, 0xf8, 0x20, 0x90, 0x48, 0x28, 0xf0, 0x40, 0x98, 0x48, 0x90, 0x20, 0xf8, 0x98, 0x40, 0xf0, 0x28, 0x30, 0xe8, 0x58, 0x80, 0xe0, 0x38, 0x88, 0x50, 0xb8, 0x60, 0xd0, 0x8, 0x68, 0xb0, 0, 0xd8, 0xc0, 0x18, 0xa8, 0x70, 0x10, 0xc8, 0x78, 0xa0, 0xe0, 0x38, 0x88, 0x50, 0x30, 0xe8, 0x58, 0x80, 0x98, 0x40, 0xf0, 0x28, 0x48, 0x90, 0x20, 0xf8, 0x10, 0xc8, 0x78, 0xa0, 0xc0, 0x18, 0xa8, 0x70, 0x68, 0xb0, 0, 0xd8, 0xb8, 0x60, 0xd0, 0x8, 0xd8, 0, 0xb0, 0x68, 0x8, 0xd0, 0x60, 0xb8, 0xa0, 0x78, 0xc8, 0x10, 0x70, 0xa8, 0x18, 0xc0, 0x28, 0xf0, 0x40, 0x98, 0xf8, 0x20, 0x90, 0x48, 0x50, 0x88, 0x38, 0xe0, 0x80, 0x58, 0xe8, 0x30, 0x90, 0x48, 0xf8, 0x20, 0x40, 0x98, 0x28, 0xf0, 0xe8, 0x30, 0x80, 0x58, 0x38, 0xe0, 0x50, 0x88, 0x60, 0xb8, 0x8, 0xd0, 0xb0, 0x68, 0xd8, 0, 0x18, 0xc0, 0x70, 0xa8, 0xc8, 0x10, 0xa0, 0x78, 0xa8, 0x70, 0xc0, 0x18, 0x78, 0xa0, 0x10, 0xc8, 0xd0, 0x8, 0xb8, 0x60, 0, 0xd8, 0x68, 0xb0, 0x58, 0x80, 0x30, 0xe8, 0x88, 0x50, 0xe0, 0x38, 0x20, 0xf8, 0x48, 0x90, 0xf0, 0x28, 0x98, 0x40 -//}; -// -// -//void crcInit(void) -//{ -// crc remainder; -// -// fprintf(stderr, "\n"); -// -// /* -// * Compute the remainder of each possible dividend. -// */ -// for (int dividend = 0; dividend < 256; ++dividend) -// { -// /* -// * Start with the dividend followed by zeros. -// */ -// remainder = dividend << (WIDTH - 8); -// -// /* -// * Perform modulo-2 division, a bit at a time. 
-// */ -// for (uint8_t bit = 8; bit > 0; --bit) -// { -// /* -// * Try to divide the current data bit. -// */ -// if (remainder & TOPBIT) -// { -// remainder = (remainder << 1) ^ POLYNOMIAL; -// } -// else -// { -// remainder = (remainder << 1); -// } -// } -// -// /* -// * Store the result into the table. -// */ -// crcTable[dividend] = remainder; -// fprintf(stderr, "%#x, ", remainder); -// } -// -//} /* crcInit() */ -// -// -//crc crcFast(uint8_t const message[], int nBytes) -//{ -// uint8_t data; -// crc remainder = 0; -// -// -// /* -// * Divide the message by the polynomial, a byte at a time. -// */ -// for (int byte = 0; byte < nBytes; ++byte) -// { -// data = message[byte] ^ (remainder >> (WIDTH - 8)); -// remainder = crcTable[data] ^ (remainder << 8); -// } -// -// /* -// * The final remainder is the CRC. -// */ -// return (remainder); -// -//} /* crcFast() */ -// -// -//crc compute_crc(const char* s) -//{ -// crc c; -// //uint8_t cache; -// -// //cache = 15; -// -//// crcInit(); -// -// c = crcFast(s, strlen(s)); -// -// //fprintf(stderr, "\nlen = %d", strlen(argv[1])); -// -// //fprintf(stderr, "\ncrc = %u\n\n", c); -// //fprintf(stderr, "\ncrc mod 8 = %u\n\n", c%8); -// -// c = c >> 3; -// //fprintf(stderr, "\nshifted crc = %u\n\n", c); -// -// //c = c & cache; -// //c = c % 32; -// -// //fprintf(stderr, "\ncrc = %u\n\n", c); -// -// return (c & 7); -//} - -static unsigned char crc8_table[] = { - 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0x95, 0xab, 0xe9, 0xd7, - 0x6d, 0x53, 0x11, 0x2f, 0x4f, 0x71, 0x33, 0x0d, 0xb7, 0x89, 0xcb, 0xf5, - 0xda, 0xe4, 0xa6, 0x98, 0x22, 0x1c, 0x5e, 0x60, 0x9e, 0xa0, 0xe2, 0xdc, - 0x66, 0x58, 0x1a, 0x24, 0x0b, 0x35, 0x77, 0x49, 0xf3, 0xcd, 0x8f, 0xb1, - 0xd1, 0xef, 0xad, 0x93, 0x29, 0x17, 0x55, 0x6b, 0x44, 0x7a, 0x38, 0x06, - 0xbc, 0x82, 0xc0, 0xfe, 0x59, 0x67, 0x25, 0x1b, 0xa1, 0x9f, 0xdd, 0xe3, - 0xcc, 0xf2, 0xb0, 0x8e, 0x34, 0x0a, 0x48, 0x76, 0x16, 0x28, 0x6a, 0x54, - 0xee, 0xd0, 0x92, 0xac, 0x83, 0xbd, 0xff, 0xc1, 
0x7b, 0x45, 0x07, 0x39, - 0xc7, 0xf9, 0xbb, 0x85, 0x3f, 0x01, 0x43, 0x7d, 0x52, 0x6c, 0x2e, 0x10, - 0xaa, 0x94, 0xd6, 0xe8, 0x88, 0xb6, 0xf4, 0xca, 0x70, 0x4e, 0x0c, 0x32, - 0x1d, 0x23, 0x61, 0x5f, 0xe5, 0xdb, 0x99, 0xa7, 0xb2, 0x8c, 0xce, 0xf0, - 0x4a, 0x74, 0x36, 0x08, 0x27, 0x19, 0x5b, 0x65, 0xdf, 0xe1, 0xa3, 0x9d, - 0xfd, 0xc3, 0x81, 0xbf, 0x05, 0x3b, 0x79, 0x47, 0x68, 0x56, 0x14, 0x2a, - 0x90, 0xae, 0xec, 0xd2, 0x2c, 0x12, 0x50, 0x6e, 0xd4, 0xea, 0xa8, 0x96, - 0xb9, 0x87, 0xc5, 0xfb, 0x41, 0x7f, 0x3d, 0x03, 0x63, 0x5d, 0x1f, 0x21, - 0x9b, 0xa5, 0xe7, 0xd9, 0xf6, 0xc8, 0x8a, 0xb4, 0x0e, 0x30, 0x72, 0x4c, - 0xeb, 0xd5, 0x97, 0xa9, 0x13, 0x2d, 0x6f, 0x51, 0x7e, 0x40, 0x02, 0x3c, - 0x86, 0xb8, 0xfa, 0xc4, 0xa4, 0x9a, 0xd8, 0xe6, 0x5c, 0x62, 0x20, 0x1e, - 0x31, 0x0f, 0x4d, 0x73, 0xc9, 0xf7, 0xb5, 0x8b, 0x75, 0x4b, 0x09, 0x37, - 0x8d, 0xb3, 0xf1, 0xcf, 0xe0, 0xde, 0x9c, 0xa2, 0x18, 0x26, 0x64, 0x5a, - 0x3a, 0x04, 0x46, 0x78, 0xc2, 0xfc, 0xbe, 0x80, 0xaf, 0x91, 0xd3, 0xed, - 0x57, 0x69, 0x2b, 0x15}; - - -unsigned crc8(unsigned char *data, size_t len) -{ - unsigned char *end; - unsigned crc; - - crc = 0; - - crc ^= 0xff; - end = data + len; - do { - crc = crc8_table[crc ^ *data++]; - } while (data < end); - return crc ^ 0xff; -} - -crc compute_crc(const char* s) -{ - unsigned c; - - c = crc8(s, strlen(s)); - //fprintf(stderr, "%02x\n", c); - return (c & 7); -} - - - - /************************************************************************** * @@ -620,21 +471,23 @@ int grow_avl(OBIDMS_avl_p avl) // TODO Lock when needed int avl_file_descriptor; char* avl_file_name; - // Get the avl file name - avl_file_name = build_avl_file_name((avl->header)->avl_name); - if (avl_file_name == NULL) - return -1; + avl_file_descriptor = avl->avl_fd; - // Open the avl file - avl_file_descriptor = openat(avl->dir_fd, avl_file_name, O_RDWR); - if (avl_file_descriptor < 0) - { - obi_set_errno(OBI_AVL_ERROR); - obidebug(1, "\nError opening an AVL tree file"); - free(avl_file_name); - 
return -1; - } - free(avl_file_name); +// // Get the avl file name +// avl_file_name = build_avl_file_name((avl->header)->avl_name); +// if (avl_file_name == NULL) +// return -1; +// +// // Open the avl file +// avl_file_descriptor = openat(avl->dir_fd, avl_file_name, O_RDWR); +// if (avl_file_descriptor < 0) +// { +// obi_set_errno(OBI_AVL_ERROR); +// obidebug(1, "\nError opening an AVL tree file"); +// free(avl_file_name); +// return -1; +// } +// free(avl_file_name); // Calculate the new file size old_data_size = (avl->header)->avl_size; @@ -683,7 +536,7 @@ int grow_avl(OBIDMS_avl_p avl) // TODO Lock when needed // Set the new avl size (avl->header)->avl_size = new_data_size; - close(avl_file_descriptor); + //close(avl_file_descriptor); return 0; } @@ -698,21 +551,23 @@ int grow_avl_data(OBIDMS_avl_p avl) // TODO Lock when needed int avl_data_file_descriptor; char* avl_data_file_name; - // Get the avl data file name - avl_data_file_name = build_avl_data_file_name((avl->header)->avl_name); - if (avl_data_file_name == NULL) - return -1; + avl_data_file_descriptor = avl->data_fd; - // Open the avl data file - avl_data_file_descriptor = openat(avl->dir_fd, avl_data_file_name, O_RDWR); - if (avl_data_file_descriptor < 0) - { - obi_set_errno(OBI_AVL_ERROR); - obidebug(1, "\nError opening an AVL tree data file"); - free(avl_data_file_name); - return -1; - } - free(avl_data_file_name); +// // Get the avl data file name +// avl_data_file_name = build_avl_data_file_name((avl->header)->avl_name); +// if (avl_data_file_name == NULL) +// return -1; +// +// // Open the avl data file +// avl_data_file_descriptor = openat(avl->dir_fd, avl_data_file_name, O_RDWR); +// if (avl_data_file_descriptor < 0) +// { +// obi_set_errno(OBI_AVL_ERROR); +// obidebug(1, "\nError opening an AVL tree data file"); +// free(avl_data_file_name); +// return -1; +// } +// free(avl_data_file_name); // Calculate the new file size old_data_size = ((avl->data)->header)->data_size_max; @@ -763,7 +618,7 
@@ int grow_avl_data(OBIDMS_avl_p avl) // TODO Lock when needed // Initialize new data to 0 memset(((avl->data)->data)+old_data_size, 0, new_data_size - old_data_size); - close(avl_data_file_descriptor); + //close(avl_data_file_descriptor); return 0; } @@ -1131,21 +986,71 @@ OBIDMS_avl_p obi_avl(OBIDMS_p dms, const char* avl_name) } -OBIDMS_avl_p* obi_create_avl_in_64_parts(OBIDMS_p dms, const char* avl_name) +OBIDMS_avl_group_p obi_create_avl_group(OBIDMS_p dms, const char* avl_name) /* Allocates a group, creates its first sub-AVL named <avl_name>_0 and records dms; no allocation or creation result is checked here. */ { - OBIDMS_avl_p* avls; + OBIDMS_avl_group_p avl_group; char* avl_name_with_idx; - uint8_t i; - avls = (OBIDMS_avl_p*) malloc(64*sizeof(OBIDMS_avl_p)); + avl_group = (OBIDMS_avl_group_p) malloc(sizeof(OBIDMS_avl_group_t)); - for (i=0; i < 64; i++) - { - asprintf(&avl_name_with_idx,"%s_%u", avl_name, i); - avls[i] = obi_create_avl(dms, avl_name_with_idx); - } + // Create 1st avl + asprintf(&avl_name_with_idx,"%s_%u", avl_name, 0); + (avl_group->sub_avls)[0] = obi_create_avl(dms, avl_name_with_idx); + avl_group->current_avl_idx = 0; + strcpy(avl_group->avl_name, avl_name); - return avls; + avl_group->dms = dms; + + return avl_group; +} + + +int unmap_an_avl(OBIDMS_avl_p avl) /* Unmaps the data region, then the tree region; returns -1 as soon as one munmap fails, 0 otherwise. */ +{ + if (munmap((avl->data)->data, ((avl->data)->header)->data_size_max) < 0) + return -1; + if (munmap(avl->tree, (((avl->header)->nb_items_max) * sizeof(AVL_node_t))) < 0) + return -1; + return 0; +} + + +int remap_an_avl(OBIDMS_avl_p avl) /* Maps the data and tree regions again from the kept-open data_fd and avl_fd; returns 0 on success, -1 if either mmap fails. */ +{ + (avl->data)->data = mmap(NULL, + ((avl->data)->header)->data_size_max, + PROT_READ | PROT_WRITE, + MAP_SHARED, + avl->data_fd, + ((avl->data)->header)->header_size); + if ((avl->data)->data == MAP_FAILED) /* mmap() signals failure with MAP_FAILED, never NULL */ + return -1; + + avl->tree = mmap(NULL, + ((avl->header)->nb_items_max) * sizeof(AVL_node_t), + PROT_READ | PROT_WRITE, + MAP_SHARED, + avl->avl_fd, + (avl->header)->header_size); + if (avl->tree == MAP_FAILED) /* mmap() signals failure with MAP_FAILED, never NULL */ + return -1; + + return 0; +} + + +int obi_add_new_avl_in_group(OBIDMS_avl_group_p avl_group) // TODO check for errors +{ + char* avl_name_with_idx; + + // unmap older + 
unmap_an_avl((avl_group->sub_avls)[avl_group->current_avl_idx]); + + (avl_group->current_avl_idx)++; + asprintf(&avl_name_with_idx,"%s_%u", avl_group->avl_name, avl_group->current_avl_idx); + (avl_group->sub_avls)[avl_group->current_avl_idx] = obi_create_avl(avl_group->dms, avl_name_with_idx); + + return 0; } @@ -1251,7 +1156,7 @@ OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name) // Initialize all bits to 0 memset(avl_data->data, 0, (avl_data->header)->data_size_max); - close(avl_data_file_descriptor); + //close(avl_data_file_descriptor); // Create the AVL tree file @@ -1351,7 +1256,13 @@ OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name) (avl->header)->creation_date = time(NULL); strcpy((avl->header)->avl_name, avl_name); - close(avl_file_descriptor); + avl->avl_fd = avl_file_descriptor; + avl->data_fd = avl_data_file_descriptor; + + // Bloom filter + bloom_init(&((avl->header)->bloom_filter), 2000000, 0.001); // TODO use macros + + //close(avl_file_descriptor); // Add in the list of opened AVL trees *(((dms->opened_avls)->avls)+((dms->opened_avls)->nb_opened_avls)) = avl; @@ -1458,7 +1369,7 @@ OBIDMS_avl_p obi_open_avl(OBIDMS_p dms, const char* avl_name) return NULL; } - close(avl_data_file_descriptor); + //close(avl_data_file_descriptor); // Open the AVL tree file @@ -1544,7 +1455,10 @@ OBIDMS_avl_p obi_open_avl(OBIDMS_p dms, const char* avl_name) avl->directory = dms->avl_directory; avl->dir_fd = avl_dir_file_descriptor; - close(avl_file_descriptor); + avl->avl_fd = avl_file_descriptor; + avl->data_fd = avl_data_file_descriptor; + + //close(avl_file_descriptor); // Add in the list of opened AVL trees *(((dms->opened_avls)->avls)+((dms->opened_avls)->nb_opened_avls)) = avl; @@ -1609,6 +1523,53 @@ byte_t* obi_avl_get(OBIDMS_avl_p avl, index_t idx) } +int maybe_in_avl(OBIDMS_avl_p avl, byte_t* value) +{ + return (bloom_check(&((avl->header)->bloom_filter), value, (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1))))); +} + + +index_t 
insert_in_avl_group(OBIDMS_avl_group_p avl_group, byte_t* value) // TODO won't be index_t +{ + index_t index_if_already_in; + int i; + + if (maybe_in_avl((avl_group->sub_avls)[avl_group->current_avl_idx], value)) + { + //fprintf(stderr, "\nyah maybe"); + index_if_already_in = obi_avl_find((avl_group->sub_avls)[avl_group->current_avl_idx], value); + if (index_if_already_in >= 0) + return index_if_already_in; + } +// else +// fprintf(stderr, "\nnah"); + for (i=0; i < (avl_group->current_avl_idx); i++) + { + if (maybe_in_avl((avl_group->sub_avls)[i], value)) + { + //fprintf(stderr, "\nyah maybe"); + if (remap_an_avl((avl_group->sub_avls)[i]) < 0) + return -1; + index_if_already_in = obi_avl_find((avl_group->sub_avls)[i], value); + if (unmap_an_avl((avl_group->sub_avls)[i]) < 0) + return -1; + if (index_if_already_in >= 0) + return index_if_already_in; + } +// else +// fprintf(stderr, "\nnah"); + } + + // not found in any avl: add in current + // first, check if make new one + if ((((avl_group->sub_avls)[avl_group->current_avl_idx])->header)->nb_items == 2000000) // TODO add condition with data size + use macro + obi_add_new_avl_in_group(avl_group); + + bloom_add(&((((avl_group->sub_avls)[avl_group->current_avl_idx])->header)->bloom_filter), value, (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1)))); + return obi_avl_add((avl_group->sub_avls)[avl_group->current_avl_idx], value); +} + + // Insert a new node index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value) { @@ -1674,7 +1635,7 @@ index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value) // Value already stored { //fprintf(stderr, "\n>>>ALREADY IN, %s, %lld\n", obi_obibytes_to_seq(value), (avl->header)->nb_items); - return current_node->value; + return current_node->value; // TODO should trigger error if using bloom filters } depth++; @@ -1732,7 +1693,7 @@ index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value) } -// Find if a value is already in an AVL tree +// Find if a value is already in an AVL tree TODO use bloom index_t 
obi_avl_find(OBIDMS_avl_p avl, byte_t* value) { int comp; diff --git a/src/obiavl.h b/src/obiavl.h index 1369b8f..1850455 100644 --- a/src/obiavl.h +++ b/src/obiavl.h @@ -25,6 +25,10 @@ #include "obidms.h" #include "obitypes.h" +#include "bloom.h" + +#define NB_OF_AVLS (64) +#define MASK (63) #define AVL_MAX_NAME (1024) /**< The maximum length of an AVL tree name. */ @@ -39,6 +43,8 @@ #define BYTE_ARRAY_HEADER_SIZE (9) /**< The size of the header of a byte array. */ +typedef struct bloom bloom_t; + /** * @brief AVL tree node structure. @@ -48,7 +54,7 @@ typedef struct AVL_node { */ index_t right_child; /**< Index of right greater child node. */ - int8_t balance_factor; /**< Balance factor of the node. + int8_t balance_factor; /**< Balance factor of the node. */ index_t value; /**< Index of the value associated with the node in the data array. */ @@ -103,6 +109,7 @@ typedef struct OBIDMS_avl_header { */ time_t creation_date; /**< Date of creation of the file. */ + bloom_t bloom_filter; } OBIDMS_avl_header_t, *OBIDMS_avl_header_p; @@ -132,9 +139,28 @@ typedef struct OBIDMS_avl { */ size_t counter; /**< Indicates by how many threads/programs (TODO) the AVL tree is used. */ + int avl_fd; + int data_fd; } OBIDMS_avl_t, *OBIDMS_avl_p; +/** + * @brief OBIDMS AVL tree group structure. + */ +typedef struct OBIDMS_avl_group { + // TODO put each group in a directory later + OBIDMS_avl_p sub_avls[64]; // TODO macro for max + int current_avl_idx; + char avl_name[AVL_MAX_NAME+1]; + OBIDMS_p dms; +} OBIDMS_avl_group_t, *OBIDMS_avl_group_p; + + +OBIDMS_avl_group_p obi_create_avl_group(OBIDMS_p dms, const char* avl_name); +index_t insert_in_avl_group(OBIDMS_avl_group_p avl_group, byte_t* value); + + + /** * @brief Checks if an AVL tree already exists or not. 
* @@ -340,12 +366,5 @@ byte_t* obi_seq_to_obibytes(char* seq); const char* obi_obibytes_to_seq(byte_t* value_b); -OBIDMS_avl_p* obi_create_avl_in_64_parts(OBIDMS_p dms, const char* avl_name); - -typedef uint8_t crc; - -crc compute_crc(const char* s); - - #endif /* OBIAVL_H_ */ diff --git a/src/obidmscolumn.c b/src/obidmscolumn.c index bfa8579..b0c1196 100644 --- a/src/obidmscolumn.c +++ b/src/obidmscolumn.c @@ -521,7 +521,6 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, OBIDMS_column_p new_column; OBIDMS_column_directory_p column_directory; OBIDMS_column_header_p header; - OBIDMS_avl_p* avl; size_t file_size; obiversion_t version_number; char* column_file_name; @@ -723,16 +722,15 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, // If the data type is OBI_STR or OBI_SEQ, the associated obi_avl is opened or created if ((returned_data_type == OBI_STR) || (returned_data_type == OBI_SEQ)) { - avl = obi_create_avl_in_64_parts(dms, avl_name); - if (avl == NULL) - { - obidebug(1, "\nError opening or creating the aVL tree associated with a column"); - munmap(new_column->header, header_size); - close(column_file_descriptor); - free(new_column); - return NULL; - } - memcpy(new_column->avl, avl, 64*sizeof(OBIDMS_avl_p)); + new_column->avl = obi_create_avl_group(dms, avl_name); +// if (avl == NULL) TODO +// { +// obidebug(1, "\nError opening or creating the aVL tree associated with a column"); +// munmap(new_column->header, header_size); +// close(column_file_descriptor); +// free(new_column); +// return NULL; +// } strncpy(header->avl_name, avl_name, AVL_MAX_NAME); } @@ -756,11 +754,11 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms, { OBIDMS_column_p column; OBIDMS_column_directory_p column_directory; - OBIDMS_avl_p avl; char* column_file_name; int column_file_descriptor; size_t header_size; size_t i; + OBIDMS_avl_p avl; column = NULL; diff --git a/src/obidmscolumn.h b/src/obidmscolumn.h index 00db5f7..39d9e04 100644 --- a/src/obidmscolumn.h +++ b/src/obidmscolumn.h @@ 
-98,7 +98,7 @@ typedef struct OBIDMS_column { */ OBIDMS_column_header_p header; /**< A pointer to the header of the column. */ - OBIDMS_avl_p avl[64]; /**< A pointer to the group of AVL trees associated with the column if there is one. + OBIDMS_avl_group_p avl; /**< TODO A pointer to the group of AVL trees associated with the column if there is one. */ void* data; /**< A `void` pointer to the beginning of the data. * diff --git a/src/obidmscolumn_seq.c b/src/obidmscolumn_seq.c index 8d329d7..ffc2abe 100644 --- a/src/obidmscolumn_seq.c +++ b/src/obidmscolumn_seq.c @@ -61,13 +61,13 @@ int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column, index_t line_nb, if (value_b == NULL) return -1; - if (strlen(value_b) == 0) - fprintf(stderr, "\nPOUIC"); + //if (strlen(value_b) == 0) + // fprintf(stderr, "\nPOUIC"); //fprintf(stderr, "\n>%s||%s", value, obi_obibytes_to_seq(value_b)); // Add in the AVL tree - idx = obi_avl_add((column->avl)[compute_crc(value)], value_b); + idx = insert_in_avl_group(column->avl, value_b); if (idx == -1) return -1; diff --git a/src/obidmscolumn_str.c b/src/obidmscolumn_str.c index d4a15f5..5a6fdad 100644 --- a/src/obidmscolumn_str.c +++ b/src/obidmscolumn_str.c @@ -61,7 +61,7 @@ int obi_column_set_obistr_with_elt_idx(OBIDMS_column_p column, index_t line_nb, return -1; // Add in the AVL tree - idx = obi_avl_add((column->avl)[compute_crc(value)], value_b); + idx = insert_in_avl_group(column->avl, value_b); if (idx == -1) return -1; diff --git a/src/obiview.c b/src/obiview.c index c700646..ef44955 100644 --- a/src/obiview.c +++ b/src/obiview.c @@ -227,6 +227,7 @@ Obiview_p obi_new_view_nuc_seqs(OBIDMS_p dms, const char* view_name, Obiview_p v if (view== NULL) return NULL; + fprintf(stderr, "\nmmmm\n"); strcpy(view->view_type, VIEW_TYPE_NUC_SEQS); if (view_to_clone == NULL)