/*
 * Copyright (c) 2012-2015, Jyri J. Virkki
 * All rights reserved.
 *
 * This file is under BSD license. See LICENSE file.
 */

/* Include guard: makes repeated inclusion of this header harmless. */
#ifndef _BLOOM_H
#define _BLOOM_H


/** ***************************************************************************
 * Added by celine.mercier@metabarcoding.org
 *
 * Error rate wanted for the bloom filters, expressed as a probability
 * (0.001 is one in a thousand).
 *
 * The expansion is parenthesized so it can be used safely inside larger
 * expressions.
 *
 */
#define BLOOM_FILTER_ERROR_RATE (0.001)


/** ***************************************************************************
 * On Linux, the code attempts to compute a bucket size based on CPU cache
 * size info, if available. If that fails for any reason, this fallback size
 * is used instead.
 *
 * On non-Linux systems, this is the bucket size always used unless the
 * caller overrides it (see bloom_init_size()).
 *
 * The value is 32 * 1024 = 32768.
 * NOTE(review): presumably a byte count (cf. the bucket_bytes field of
 * struct bloom) - confirm against the implementation.
 *
 */
#define BLOOM_BUCKET_SIZE_FALLBACK (32 * 1024)


/** ***************************************************************************
 * It was found that using multiplier x0.5 for CPU L1 cache size is
 * more effective in terms of CPU usage and, surprisingly, collisions
 * number.
 *
 * Feel free to tune this constant the way it will work for you.
 *
 * NOTE(review): the value below is 1, whereas the x0.5 multiplier described
 * above would correspond to a divisor of 2. Confirm which one is intended
 * before relying on this comment.
 *
 */
#define BLOOM_L1_CACHE_SIZE_DIV 1


/** ***************************************************************************
 * Structure to keep track of one bloom filter. Caller needs to
 * allocate this and pass it to the functions below. First call for
 * every struct must be to bloom_init().
 *
 * The structure ends with a flexible array member (bf). sizeof(struct bloom)
 * therefore covers the fixed fields only: any storage for bf has to be
 * provided in addition to it when the structure is allocated, and copying a
 * struct bloom by plain assignment does not copy the contents of bf.
 *
 */
struct bloom
{
  // These fields are part of the public interface of this structure.
  // Client code may read these values if desired. Client code MUST NOT
  // modify any of these.
  int entries;
  double error;
  int bits;
  int bytes;
  int hashes;

  // Fields below are private to the implementation. These may go away or
  // change incompatibly at any moment. Client code MUST NOT access or rely
  // on these.
  unsigned buckets;
  unsigned bucket_bytes;

  // x86 CPU divide by/multiply by operation optimization helpers
  unsigned bucket_bytes_exponent;
  unsigned bucket_bits_fast_mod_operand;

  double bpe;
  int ready;

  // Flexible array member; must remain the last field of the structure.
  // NOTE(review): presumably holds the filter's bit field - confirm against
  // the implementation.
  unsigned char bf[];
};

/* Convenience alias: bloom_t names the same type as struct bloom. */
typedef struct bloom bloom_t;


/** ***************************************************************************
 * Added by celine.mercier@metabarcoding.org
 *
 * This function computes the size needed by the bloom filter
 * depending on the number of entries and the error rate.
 *
 * Parameters:
 * -----------
 *     entries - The number of entries the filter is sized for.
 *     error   - The error rate the filter is sized for.
 *
 * Return:
 * -------
 *     The size needed by the bloom filter.
 *     NOTE(review): the unit (presumably bytes) is not stated in this
 *     header - confirm against the implementation.
 *
 */
int bloom_filter_size(int entries, double error);


/** ***************************************************************************
 * Initialize the bloom filter for use.
 *
 * The filter is initialized with a bit field and number of hash functions
 * according to the computations from the wikipedia entry:
 *     http://en.wikipedia.org/wiki/Bloom_filter
 *
 * Optimal number of bits is:
 *     bits = -(entries * ln(error)) / ln(2)^2
 *
 * (ln(error) is negative for any error rate below 1, hence the leading
 * minus sign.)
 *
 * Optimal number of hash functions is:
 *     hashes = bpe * ln(2)
 *
 * where bpe is the number of bits per entry (bits / entries).
 *
 * Parameters:
 * -----------
 *     bloom   - Pointer to an allocated struct bloom (see above).
 *     entries - The expected number of entries which will be inserted.
 *
 * This function takes no error-rate argument.
 * NOTE(review): presumably BLOOM_FILTER_ERROR_RATE is used as the
 * probability of collision (as long as entries are not exceeded) - confirm
 * against the implementation.
 *
 * Return:
 * -------
 *     0 - on success
 *     1 - on failure
 *
 */
int bloom_init(struct bloom * bloom, int entries);


/** ***************************************************************************
 * Initialize the bloom filter for use.
 *
 * This is the same as bloom_init() but allows the caller to pass in a
 * cache_size to override the internal value (which is either computed
 * or the default of BLOOM_BUCKET_SIZE_FALLBACK). Mostly useful for
 * experimenting.
 *
 * See misc/bucketsize for a script which can help identify a good value
 * for cache_size.
 *
 * Parameters:
 * -----------
 *     bloom      - Pointer to an allocated struct bloom.
 *     entries    - The expected number of entries which will be inserted.
 *     error      - Probability of collision (as long as entries are not
 *                  exceeded).
 *     cache_size - Value overriding the internally chosen bucket size.
 *
 * Return:
 * -------
 *     NOTE(review): presumably 0 on success and 1 on failure, like
 *     bloom_init() - confirm against the implementation.
 *
 */
int bloom_init_size(struct bloom * bloom, int entries, double error,
                    unsigned int cache_size);


/** ***************************************************************************
 * Check if the given element is in the bloom filter. Remember this may
 * return false positive if a collision occurred.
 *
 * Parameters:
 * -----------
 *     bloom  - Pointer to an allocated struct bloom (see above).
 *     buffer - Pointer to buffer containing element to check.
 *     len    - Size of 'buffer'.
 *
 * Return:
 * -------
 *     0 - element is not present
 *     1 - element is present (or false positive due to collision)
 *    -1 - bloom not initialized
 *
 */
int bloom_check(struct bloom * bloom, const void * buffer, int len);


/** ***************************************************************************
 * Add the given element to the bloom filter.
 * The return code indicates if the element (or a collision) was already in,
 * so for the common check+add use case, no need to call check separately.
 *
 * Parameters:
 * -----------
 *     bloom  - Pointer to an allocated struct bloom (see above).
 *     buffer - Pointer to buffer containing element to add.
 *     len    - Size of 'buffer'.
 *
 * Return:
 * -------
 *     0 - element was not present and was added
 *     1 - element (or a collision) had already been added previously
 *    -1 - bloom not initialized
 *
 */
int bloom_add(struct bloom * bloom, const void * buffer, int len);


/** ***************************************************************************
 * Print (to stdout) info about this bloom filter. Debugging aid.
 *
 * Parameters:
 * -----------
 *     bloom - Pointer to the struct bloom to describe.
 *
 * Return: none
 *
 */
void bloom_print(struct bloom * bloom);


/** ***************************************************************************
 * Deallocate internal storage.
 *
 * Upon return, the bloom struct is no longer usable. You may call bloom_init
 * again on the same struct to reinitialize it again.
 *
 * NOTE(review): struct bloom keeps its data in a flexible array member that
 * is part of the struct's own allocation, so this header does not show which
 * separate storage is released here - confirm against the implementation.
 *
 * Parameters:
 * -----------
 *     bloom - Pointer to an allocated struct bloom (see above).
 *
 * Return: none
 *
 */
void bloom_free(struct bloom * bloom);


/** ***************************************************************************
 * Returns version string compiled into library.
 *
 * Declared with (void) so that this is a real prototype: in C (before C23)
 * an empty parameter list leaves the arguments unspecified and disables
 * argument checking at call sites.
 *
 * Return: version string
 *
 */
const char * bloom_version(void);


#endif /* _BLOOM_H */