Multiple AVLs with bloom filters (very raw test version)

This commit is contained in:
Celine Mercier
2016-03-18 11:06:02 +01:00
parent 545ed8111a
commit 3681cecb4d
12 changed files with 677 additions and 223 deletions

View File

@ -19,6 +19,9 @@
#include <fcntl.h>
#include <math.h>
//#include <libbloom.h>
#include "bloom.h"
#include "obiavl.h"
#include "obierrno.h"
#include "obitypes.h"
@ -30,158 +33,6 @@
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
////crc crcTable[256];
//static crc crcTable[] = {
//0x00, 0xd8, 0x68, 0xb0, 0xd0, 0x8, 0xb8, 0x60, 0x78, 0xa0, 0x10, 0xc8, 0xa8, 0x70, 0xc0, 0x18, 0xf0, 0x28, 0x98, 0x40, 0x20, 0xf8, 0x48, 0x90, 0x88, 0x50, 0xe0, 0x38, 0x58, 0x80, 0x30, 0xe8, 0x38, 0xe0, 0x50, 0x88, 0xe8, 0x30, 0x80, 0x58, 0x40, 0x98, 0x28, 0xf0, 0x90, 0x48, 0xf8, 0x20, 0xc8, 0x10, 0xa0, 0x78, 0x18, 0xc0, 0x70, 0xa8, 0xb0, 0x68, 0xd8, 0, 0x60, 0xb8, 0x8, 0xd0, 0x70, 0xa8, 0x18, 0xc0, 0xa0, 0x78, 0xc8, 0x10, 0x8, 0xd0, 0x60, 0xb8, 0xd8, 0, 0xb0, 0x68, 0x80, 0x58, 0xe8, 0x30, 0x50, 0x88, 0x38, 0xe0, 0xf8, 0x20, 0x90, 0x48, 0x28, 0xf0, 0x40, 0x98, 0x48, 0x90, 0x20, 0xf8, 0x98, 0x40, 0xf0, 0x28, 0x30, 0xe8, 0x58, 0x80, 0xe0, 0x38, 0x88, 0x50, 0xb8, 0x60, 0xd0, 0x8, 0x68, 0xb0, 0, 0xd8, 0xc0, 0x18, 0xa8, 0x70, 0x10, 0xc8, 0x78, 0xa0, 0xe0, 0x38, 0x88, 0x50, 0x30, 0xe8, 0x58, 0x80, 0x98, 0x40, 0xf0, 0x28, 0x48, 0x90, 0x20, 0xf8, 0x10, 0xc8, 0x78, 0xa0, 0xc0, 0x18, 0xa8, 0x70, 0x68, 0xb0, 0, 0xd8, 0xb8, 0x60, 0xd0, 0x8, 0xd8, 0, 0xb0, 0x68, 0x8, 0xd0, 0x60, 0xb8, 0xa0, 0x78, 0xc8, 0x10, 0x70, 0xa8, 0x18, 0xc0, 0x28, 0xf0, 0x40, 0x98, 0xf8, 0x20, 0x90, 0x48, 0x50, 0x88, 0x38, 0xe0, 0x80, 0x58, 0xe8, 0x30, 0x90, 0x48, 0xf8, 0x20, 0x40, 0x98, 0x28, 0xf0, 0xe8, 0x30, 0x80, 0x58, 0x38, 0xe0, 0x50, 0x88, 0x60, 0xb8, 0x8, 0xd0, 0xb0, 0x68, 0xd8, 0, 0x18, 0xc0, 0x70, 0xa8, 0xc8, 0x10, 0xa0, 0x78, 0xa8, 0x70, 0xc0, 0x18, 0x78, 0xa0, 0x10, 0xc8, 0xd0, 0x8, 0xb8, 0x60, 0, 0xd8, 0x68, 0xb0, 0x58, 0x80, 0x30, 0xe8, 0x88, 0x50, 0xe0, 0x38, 0x20, 0xf8, 0x48, 0x90, 0xf0, 0x28, 0x98, 0x40
//};
//
//
//void crcInit(void)
//{
// crc remainder;
//
// fprintf(stderr, "\n");
//
// /*
// * Compute the remainder of each possible dividend.
// */
// for (int dividend = 0; dividend < 256; ++dividend)
// {
// /*
// * Start with the dividend followed by zeros.
// */
// remainder = dividend << (WIDTH - 8);
//
// /*
// * Perform modulo-2 division, a bit at a time.
// */
// for (uint8_t bit = 8; bit > 0; --bit)
// {
// /*
// * Try to divide the current data bit.
// */
// if (remainder & TOPBIT)
// {
// remainder = (remainder << 1) ^ POLYNOMIAL;
// }
// else
// {
// remainder = (remainder << 1);
// }
// }
//
// /*
// * Store the result into the table.
// */
// crcTable[dividend] = remainder;
// fprintf(stderr, "%#x, ", remainder);
// }
//
//} /* crcInit() */
//
//
//crc crcFast(uint8_t const message[], int nBytes)
//{
// uint8_t data;
// crc remainder = 0;
//
//
// /*
// * Divide the message by the polynomial, a byte at a time.
// */
// for (int byte = 0; byte < nBytes; ++byte)
// {
// data = message[byte] ^ (remainder >> (WIDTH - 8));
// remainder = crcTable[data] ^ (remainder << 8);
// }
//
// /*
// * The final remainder is the CRC.
// */
// return (remainder);
//
//} /* crcFast() */
//
//
//crc compute_crc(const char* s)
//{
// crc c;
// //uint8_t cache;
//
// //cache = 15;
//
//// crcInit();
//
// c = crcFast(s, strlen(s));
//
// //fprintf(stderr, "\nlen = %d", strlen(argv[1]));
//
// //fprintf(stderr, "\ncrc = %u\n\n", c);
// //fprintf(stderr, "\ncrc mod 8 = %u\n\n", c%8);
//
// c = c >> 3;
// //fprintf(stderr, "\nshifted crc = %u\n\n", c);
//
// //c = c & cache;
// //c = c % 32;
//
// //fprintf(stderr, "\ncrc = %u\n\n", c);
//
// return (c & 7);
//}
/*
 * Lookup table of the table-driven 8-bit CRC: the entry at index
 * (current register XOR input byte) is the next value of the register.
 * The table is only ever read, hence const.
 */
static const unsigned char crc8_table[] = {
    0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0x95, 0xab, 0xe9, 0xd7,
    0x6d, 0x53, 0x11, 0x2f, 0x4f, 0x71, 0x33, 0x0d, 0xb7, 0x89, 0xcb, 0xf5,
    0xda, 0xe4, 0xa6, 0x98, 0x22, 0x1c, 0x5e, 0x60, 0x9e, 0xa0, 0xe2, 0xdc,
    0x66, 0x58, 0x1a, 0x24, 0x0b, 0x35, 0x77, 0x49, 0xf3, 0xcd, 0x8f, 0xb1,
    0xd1, 0xef, 0xad, 0x93, 0x29, 0x17, 0x55, 0x6b, 0x44, 0x7a, 0x38, 0x06,
    0xbc, 0x82, 0xc0, 0xfe, 0x59, 0x67, 0x25, 0x1b, 0xa1, 0x9f, 0xdd, 0xe3,
    0xcc, 0xf2, 0xb0, 0x8e, 0x34, 0x0a, 0x48, 0x76, 0x16, 0x28, 0x6a, 0x54,
    0xee, 0xd0, 0x92, 0xac, 0x83, 0xbd, 0xff, 0xc1, 0x7b, 0x45, 0x07, 0x39,
    0xc7, 0xf9, 0xbb, 0x85, 0x3f, 0x01, 0x43, 0x7d, 0x52, 0x6c, 0x2e, 0x10,
    0xaa, 0x94, 0xd6, 0xe8, 0x88, 0xb6, 0xf4, 0xca, 0x70, 0x4e, 0x0c, 0x32,
    0x1d, 0x23, 0x61, 0x5f, 0xe5, 0xdb, 0x99, 0xa7, 0xb2, 0x8c, 0xce, 0xf0,
    0x4a, 0x74, 0x36, 0x08, 0x27, 0x19, 0x5b, 0x65, 0xdf, 0xe1, 0xa3, 0x9d,
    0xfd, 0xc3, 0x81, 0xbf, 0x05, 0x3b, 0x79, 0x47, 0x68, 0x56, 0x14, 0x2a,
    0x90, 0xae, 0xec, 0xd2, 0x2c, 0x12, 0x50, 0x6e, 0xd4, 0xea, 0xa8, 0x96,
    0xb9, 0x87, 0xc5, 0xfb, 0x41, 0x7f, 0x3d, 0x03, 0x63, 0x5d, 0x1f, 0x21,
    0x9b, 0xa5, 0xe7, 0xd9, 0xf6, 0xc8, 0x8a, 0xb4, 0x0e, 0x30, 0x72, 0x4c,
    0xeb, 0xd5, 0x97, 0xa9, 0x13, 0x2d, 0x6f, 0x51, 0x7e, 0x40, 0x02, 0x3c,
    0x86, 0xb8, 0xfa, 0xc4, 0xa4, 0x9a, 0xd8, 0xe6, 0x5c, 0x62, 0x20, 0x1e,
    0x31, 0x0f, 0x4d, 0x73, 0xc9, 0xf7, 0xb5, 0x8b, 0x75, 0x4b, 0x09, 0x37,
    0x8d, 0xb3, 0xf1, 0xcf, 0xe0, 0xde, 0x9c, 0xa2, 0x18, 0x26, 0x64, 0x5a,
    0x3a, 0x04, 0x46, 0x78, 0xc2, 0xfc, 0xbe, 0x80, 0xaf, 0x91, 0xd3, 0xed,
    0x57, 0x69, 0x2b, 0x15};


/**
 * Computes an 8-bit CRC over a byte buffer using crc8_table.
 *
 * The register starts at 0xff and the final value is XORed with 0xff.
 *
 * @param data Pointer to the bytes to checksum. It is only read.
 * @param len  Number of bytes to process. May be 0, in which case no byte
 *             is read and the CRC of the empty message (0) is returned.
 *
 * @return The CRC, in the range 0..255.
 */
unsigned crc8(unsigned char *data, size_t len)
{
    unsigned crc = 0xff;
    size_t   i;

    // A plain counted loop (instead of do/while) guarantees that nothing is
    // read when len == 0.
    for (i = 0; i < len; i++)
        crc = crc8_table[crc ^ data[i]];

    return crc ^ 0xff;
}
/**
 * Reduces a character string to a 3-bit value derived from its CRC-8.
 *
 * @param s NUL-terminated string to hash (the terminator is not included
 *          in the length handed to crc8()).
 *
 * @return The 3 lowest bits of crc8() over the string, i.e. a value in 0..7.
 */
crc compute_crc(const char* s)
{
    unsigned c;

    // crc8() is declared with a non-const unsigned byte pointer but only
    // reads through it, so the explicit conversion is safe and avoids an
    // incompatible-pointer argument.
    c = crc8((unsigned char*) s, strlen(s));

    return (c & 7);
}
/**************************************************************************
*
@ -620,21 +471,23 @@ int grow_avl(OBIDMS_avl_p avl) // TODO Lock when needed
int avl_file_descriptor;
char* avl_file_name;
// Get the avl file name
avl_file_name = build_avl_file_name((avl->header)->avl_name);
if (avl_file_name == NULL)
return -1;
avl_file_descriptor = avl->avl_fd;
// Open the avl file
avl_file_descriptor = openat(avl->dir_fd, avl_file_name, O_RDWR);
if (avl_file_descriptor < 0)
{
obi_set_errno(OBI_AVL_ERROR);
obidebug(1, "\nError opening an AVL tree file");
free(avl_file_name);
return -1;
}
free(avl_file_name);
// // Get the avl file name
// avl_file_name = build_avl_file_name((avl->header)->avl_name);
// if (avl_file_name == NULL)
// return -1;
//
// // Open the avl file
// avl_file_descriptor = openat(avl->dir_fd, avl_file_name, O_RDWR);
// if (avl_file_descriptor < 0)
// {
// obi_set_errno(OBI_AVL_ERROR);
// obidebug(1, "\nError opening an AVL tree file");
// free(avl_file_name);
// return -1;
// }
// free(avl_file_name);
// Calculate the new file size
old_data_size = (avl->header)->avl_size;
@ -683,7 +536,7 @@ int grow_avl(OBIDMS_avl_p avl) // TODO Lock when needed
// Set the new avl size
(avl->header)->avl_size = new_data_size;
close(avl_file_descriptor);
//close(avl_file_descriptor);
return 0;
}
@ -698,21 +551,23 @@ int grow_avl_data(OBIDMS_avl_p avl) // TODO Lock when needed
int avl_data_file_descriptor;
char* avl_data_file_name;
// Get the avl data file name
avl_data_file_name = build_avl_data_file_name((avl->header)->avl_name);
if (avl_data_file_name == NULL)
return -1;
avl_data_file_descriptor = avl->data_fd;
// Open the avl data file
avl_data_file_descriptor = openat(avl->dir_fd, avl_data_file_name, O_RDWR);
if (avl_data_file_descriptor < 0)
{
obi_set_errno(OBI_AVL_ERROR);
obidebug(1, "\nError opening an AVL tree data file");
free(avl_data_file_name);
return -1;
}
free(avl_data_file_name);
// // Get the avl data file name
// avl_data_file_name = build_avl_data_file_name((avl->header)->avl_name);
// if (avl_data_file_name == NULL)
// return -1;
//
// // Open the avl data file
// avl_data_file_descriptor = openat(avl->dir_fd, avl_data_file_name, O_RDWR);
// if (avl_data_file_descriptor < 0)
// {
// obi_set_errno(OBI_AVL_ERROR);
// obidebug(1, "\nError opening an AVL tree data file");
// free(avl_data_file_name);
// return -1;
// }
// free(avl_data_file_name);
// Calculate the new file size
old_data_size = ((avl->data)->header)->data_size_max;
@ -763,7 +618,7 @@ int grow_avl_data(OBIDMS_avl_p avl) // TODO Lock when needed
// Initialize new data to 0
memset(((avl->data)->data)+old_data_size, 0, new_data_size - old_data_size);
close(avl_data_file_descriptor);
//close(avl_data_file_descriptor);
return 0;
}
@ -1131,21 +986,71 @@ OBIDMS_avl_p obi_avl(OBIDMS_p dms, const char* avl_name)
}
OBIDMS_avl_p* obi_create_avl_in_64_parts(OBIDMS_p dms, const char* avl_name)
OBIDMS_avl_group_p obi_create_avl_group(OBIDMS_p dms, const char* avl_name)
{
OBIDMS_avl_p* avls;
OBIDMS_avl_group_p avl_group;
char* avl_name_with_idx;
uint8_t i;
avls = (OBIDMS_avl_p*) malloc(64*sizeof(OBIDMS_avl_p));
avl_group = (OBIDMS_avl_group_p) malloc(sizeof(OBIDMS_avl_group_t));
for (i=0; i < 64; i++)
{
asprintf(&avl_name_with_idx,"%s_%u", avl_name, i);
avls[i] = obi_create_avl(dms, avl_name_with_idx);
}
// Create 1st avl
asprintf(&avl_name_with_idx,"%s_%u", avl_name, 0);
(avl_group->sub_avls)[0] = obi_create_avl(dms, avl_name_with_idx);
avl_group->current_avl_idx = 0;
strcpy(avl_group->avl_name, avl_name);
return avls;
avl_group->dms = dms;
return avl_group;
}
/**
 * Releases the memory mappings of an AVL: first the data region, then the
 * array of tree nodes.
 *
 * The tree is not touched if unmapping the data region fails.
 *
 * @param avl The AVL whose data and tree mappings are released.
 *
 * @return 0 on success, -1 if either munmap() call fails.
 */
int unmap_an_avl(OBIDMS_avl_p avl)
{
    int status;

    // Data region
    status = munmap((avl->data)->data, ((avl->data)->header)->data_size_max);
    if (status < 0)
        return -1;

    // Tree nodes
    status = munmap(avl->tree, (((avl->header)->nb_items_max) * sizeof(AVL_node_t)));

    return (status < 0) ? -1 : 0;
}
/**
 * Maps the data region and the tree nodes of an AVL back in memory, using
 * the file descriptors kept in the AVL structure (data_fd and avl_fd).
 *
 * Each region is mapped read/write and shared, starting right after the
 * header of its file (offset header_size).
 *
 * @param avl The AVL to map. On success (avl->data)->data and avl->tree
 *            point to the new mappings.
 *
 * @return 0 on success, -1 if one of the mmap() calls fails. When the tree
 *         mapping fails, the data mapping obtained just before is released
 *         so that no mapping is leaked.
 */
int remap_an_avl(OBIDMS_avl_p avl)
{
    (avl->data)->data = mmap(NULL,
                             ((avl->data)->header)->data_size_max,
                             PROT_READ | PROT_WRITE,
                             MAP_SHARED,
                             avl->data_fd,
                             ((avl->data)->header)->header_size);
    // mmap() reports failure with MAP_FAILED, never with NULL
    if ((avl->data)->data == MAP_FAILED)
        return -1;

    avl->tree = mmap(NULL,
                     ((avl->header)->nb_items_max) * sizeof(AVL_node_t),
                     PROT_READ | PROT_WRITE,
                     MAP_SHARED,
                     avl->avl_fd,
                     (avl->header)->header_size);
    if (avl->tree == MAP_FAILED)
    {
        // Roll back the data mapping so the AVL is left fully unmapped
        munmap((avl->data)->data, ((avl->data)->header)->data_size_max);
        return -1;
    }

    return 0;
}
/**
 * Retires the current AVL of a group and creates the next one.
 *
 * The current AVL is unmapped, the group's current index is incremented and
 * a new AVL named "<group name>_<new index>" is created in the group's DMS
 * and stored at that index.
 *
 * @param avl_group The group to extend.
 *
 * @return 0 on success, -1 on failure. On failure the current index is
 *         restored to its previous value, but the previous AVL may already
 *         be unmapped, so the caller should treat -1 as fatal for the group.
 *
 * NOTE(review): no check is made that the new index still fits in sub_avls;
 * the capacity of that array is not visible from here.
 */
int obi_add_new_avl_in_group(OBIDMS_avl_group_p avl_group)
{
    char* avl_name_with_idx;

    // Unmap the AVL that is being retired
    if (unmap_an_avl((avl_group->sub_avls)[avl_group->current_avl_idx]) < 0)
        return -1;

    (avl_group->current_avl_idx)++;

    // Build the name of the new AVL. The buffer content is undefined when
    // asprintf() fails, so it must not be freed in that case.
    if (asprintf(&avl_name_with_idx, "%s_%u", avl_group->avl_name, avl_group->current_avl_idx) < 0)
    {
        (avl_group->current_avl_idx)--;
        return -1;
    }

    (avl_group->sub_avls)[avl_group->current_avl_idx] = obi_create_avl(avl_group->dms, avl_name_with_idx);

    // obi_create_avl() copies the name into the AVL header, so the temporary
    // string is released here instead of being leaked.
    // NOTE(review): assumes obi_create_avl() keeps no other reference to it.
    free(avl_name_with_idx);

    // NOTE(review): NULL is taken to be the failure value of obi_create_avl()
    if ((avl_group->sub_avls)[avl_group->current_avl_idx] == NULL)
    {
        (avl_group->current_avl_idx)--;
        return -1;
    }

    return 0;
}
@ -1251,7 +1156,7 @@ OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name)
// Initialize all bits to 0
memset(avl_data->data, 0, (avl_data->header)->data_size_max);
close(avl_data_file_descriptor);
//close(avl_data_file_descriptor);
// Create the AVL tree file
@ -1351,7 +1256,13 @@ OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name)
(avl->header)->creation_date = time(NULL);
strcpy((avl->header)->avl_name, avl_name);
close(avl_file_descriptor);
avl->avl_fd = avl_file_descriptor;
avl->data_fd = avl_data_file_descriptor;
// Bloom filter
bloom_init(&((avl->header)->bloom_filter), 2000000, 0.001); // TODO use macros
//close(avl_file_descriptor);
// Add in the list of opened AVL trees
*(((dms->opened_avls)->avls)+((dms->opened_avls)->nb_opened_avls)) = avl;
@ -1458,7 +1369,7 @@ OBIDMS_avl_p obi_open_avl(OBIDMS_p dms, const char* avl_name)
return NULL;
}
close(avl_data_file_descriptor);
//close(avl_data_file_descriptor);
// Open the AVL tree file
@ -1544,7 +1455,10 @@ OBIDMS_avl_p obi_open_avl(OBIDMS_p dms, const char* avl_name)
avl->directory = dms->avl_directory;
avl->dir_fd = avl_dir_file_descriptor;
close(avl_file_descriptor);
avl->avl_fd = avl_file_descriptor;
avl->data_fd = avl_data_file_descriptor;
//close(avl_file_descriptor);
// Add in the list of opened AVL trees
*(((dms->opened_avls)->avls)+((dms->opened_avls)->nb_opened_avls)) = avl;
@ -1609,6 +1523,53 @@ byte_t* obi_avl_get(OBIDMS_avl_p avl, index_t idx)
}
/**
 * Queries the bloom filter stored in the header of an AVL for a value.
 *
 * The number of bytes submitted to the filter is BYTE_ARRAY_HEADER_SIZE plus
 * the 32-bit integer stored at byte offset 1 of the value (presumably the
 * payload length of the byte array -- confirm against the byte array layout).
 *
 * @param avl   The AVL whose bloom filter is queried.
 * @param value The byte array to look for.
 *
 * @return The result of bloom_check(), unchanged.
 */
int maybe_in_avl(OBIDMS_avl_p avl, byte_t* value)
{
    int32_t value_length;

    // value+1 is not guaranteed to be aligned for an int32_t, so the field is
    // copied out instead of being read through a casted pointer.
    memcpy(&value_length, value+1, sizeof(int32_t));

    return (bloom_check(&((avl->header)->bloom_filter), value, (BYTE_ARRAY_HEADER_SIZE + value_length)));
}
/**
 * Inserts a value in a group of AVLs unless one of them already holds it.
 *
 * The bloom filter of each AVL is consulted first so that an AVL is only
 * searched when it may contain the value. The current AVL is checked first,
 * then the older ones, which are remapped for the search and unmapped again
 * right after. If the value is found nowhere, it is recorded in the bloom
 * filter of the current AVL and added to that AVL; a new current AVL is
 * created beforehand when the current one holds 2000000 items.
 *
 * @param avl_group The group of AVLs.
 * @param value     The byte array to insert.
 *
 * @return The index returned by obi_avl_find() if the value is already
 *         stored, the result of obi_avl_add() otherwise, or -1 if an older
 *         AVL could not be remapped/unmapped or if a new AVL could not be
 *         added to the group.
 */
index_t insert_in_avl_group(OBIDMS_avl_group_p avl_group, byte_t* value) // TODO won't be index_t
{
    index_t index_if_already_in;
    int32_t value_length;
    int     i;

    // Look in the current AVL first
    if (maybe_in_avl((avl_group->sub_avls)[avl_group->current_avl_idx], value))
    {
        index_if_already_in = obi_avl_find((avl_group->sub_avls)[avl_group->current_avl_idx], value);
        if (index_if_already_in >= 0)
            return index_if_already_in;
    }

    // Then in the older AVLs, which are unmapped and have to be remapped
    // for the duration of the search
    for (i=0; i < (avl_group->current_avl_idx); i++)
    {
        if (maybe_in_avl((avl_group->sub_avls)[i], value))
        {
            if (remap_an_avl((avl_group->sub_avls)[i]) < 0)
                return -1;
            index_if_already_in = obi_avl_find((avl_group->sub_avls)[i], value);
            if (unmap_an_avl((avl_group->sub_avls)[i]) < 0)
                return -1;
            if (index_if_already_in >= 0)
                return index_if_already_in;
        }
    }

    // Not found in any AVL: add in the current one.
    // First, check whether a new AVL has to be started.
    if ((((avl_group->sub_avls)[avl_group->current_avl_idx])->header)->nb_items == 2000000) // TODO add condition with data size + use macro
    {
        // Do not go on with an AVL that could not be created
        if (obi_add_new_avl_in_group(avl_group) < 0)
            return -1;
    }

    // value+1 is not guaranteed to be aligned for an int32_t, so the field is
    // copied out instead of being read through a casted pointer.
    memcpy(&value_length, value+1, sizeof(int32_t));

    bloom_add(&((((avl_group->sub_avls)[avl_group->current_avl_idx])->header)->bloom_filter), value, (BYTE_ARRAY_HEADER_SIZE + value_length));

    return obi_avl_add((avl_group->sub_avls)[avl_group->current_avl_idx], value);
}
// Insert a new node
index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value)
{
@ -1674,7 +1635,7 @@ index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value)
// Value already stored
{
//fprintf(stderr, "\n>>>ALREADY IN, %s, %lld\n", obi_obibytes_to_seq(value), (avl->header)->nb_items);
return current_node->value;
return current_node->value; // TODO should trigger error if using bloom filters
}
depth++;
@ -1732,7 +1693,7 @@ index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value)
}
// Find if a value is already in an AVL tree
// Find if a value is already in an AVL tree TODO use bloom
index_t obi_avl_find(OBIDMS_avl_p avl, byte_t* value)
{
int comp;