initial bloom filter import
parent
93e470eb80
commit
5a45d15894
@ 0,0 +1,5 @@


INCLUDES = I../include I$(RIOTBASE)/core/include


MODULE = bloom




include $(RIOTBASE)/Makefile.base



@ 0,0 +1,267 @@


/******************************************************************************


* bloom.c


* ```````


* Bloom filters


*


* HISTORY


* {x, y, z}


* A Bloom filter is a probibalistic : : :


* data structure with several interesting /\ /\ /\


* properties, such as low memory usage, /  X  X  \


* asymmetric query confidence, and a very / / \/ \ \


* speedy O(k) membership test. /   \ \


* / / /\ \ \


* Because a Bloom filter can . . . . . . . . .


* accept any input that can be 00000000001000101010101010100010000000000


* hashed effectively (such as " " "


* strings), that membership test \  /


* tends to draw a crowd. TNSTAAFL, but \  /


* as caveats go, the Bloom filters' are \  /


* more interesting than incapacitating. \/


* :


* Most notably, it can tell you with certainty {w}


* that an item 'i' is *not* a member of set 's',


* but it can only tell you with some finite


* probability whether an item 'i' *is* a member


* of set 's'.


*


* Still, along with the intriguing possibility of using bitwise AND and OR


* to compute the logical union and intersection of two filters, the cheap


* cost of adding elements to the filter set, and the low memory requirements,


* the Bloom filter is a good choice for many applications.


*


* NOTES


*


* Let's look more closely at the probability values.


*


* Assume that a hash function selects each array position with equal


* probability. If m is the number of bits in the array, and k is the number


* of hash functions, then the probability that a certain bit is not set


* to 1 by a certain hash function during the insertion of an element is


*


* 1(1/m).


*


* The probability that it is not set to 1 by any of the hash functions is


*


* (1(1/m))^k.


*


* If we have inserted n elements, the probability that a certain bit is


* set 0 is


*


* (1(1/m))^kn,


*


* Meaning that the probability said bit is set to 1 is therefore


*


* 1([1(1/m)]^kn).


*


* Now test membership of an element that is not in the set. Each of the k


* array positions computed by the hash functions is 1 with a probability


* as above. The probability of all of them being 1, which would cause the


* algorithm to erroneously claim that the element is in the set, is often


* given as


*


* (1[1(1/m)]^kn)^k ~~ (1  e^(kn/m))^k.


*


* This is not strictly correct as it assumes independence for the


* probabilities of each bit being set. However, assuming it is a close


* approximation we have that the probability of false positives descreases


* as m (the number of bits in the array) increases, and increases as n


* (the number of inserted elements) increases. For a given m and n, the


* value of k (the number of hash functions) that minimizes the probability


* is


*


* (m/n)ln(2) ~~ 0.7(m/n),


*


* which gives the false positive probability of


*


* 2^k ~~ 0.6185^(m/n).


*


* The required number of bits m, given n and a desired false positive


* probability p (and assuming the optimal value of k is used) can be


* computed by substituting the optimal value of k in the probability


* expression above:


*


* p = (1  e^((((m/n)ln(2))*(n/m))))^((m/n)ln(2)),


*


* which simplifies to


*


* ln(p) = (m/n) * (ln2)^2.


*


* This results in the equation


*


* m = ((n*ln(p)) / ((ln(2))^2))


*


* The classic filter uses


*


* 1.44*log2(1/eta)


*


* bits of space per inserted key, where eta is the false positive rate of


* the Bloom filter.


*


* AUTHOR


* Jason Linehan (patientulysses@gmail.com)


*


* LICENSE


* Public domain.


*


******************************************************************************/




#include <limits.h>


#include <stdarg.h>


#include <stdbool.h>




#include "bloom.h"






#define SETBIT(a,n) (a[n/CHAR_BIT] = (1<<(n%CHAR_BIT)))


#define GETBIT(a,n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT)))


#define ROUND(size) ((size + CHAR_BIT  1) / CHAR_BIT)






/******************************************************************************


* bloom_new Allocate and return a pointer to a new Bloom filter.


* `````````


* @size : size of the bit array in the filter


* @nfuncs: the number of hash functions


* Returns: An allocated bloom filter


*


* USAGE


* For best results, make 'size' a power of 2.


*


******************************************************************************/


struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) {


struct bloom_t *bloom;


va_list hashes;


int n;




/* Allocate Bloom filter container */


if (!(bloom = malloc(sizeof(struct bloom_t)))) {


return NULL;


}




/* Allocate Bloom array */


if (!(bloom>a = calloc(ROUND(size), sizeof(char)))) {


free(bloom);


return NULL;


}




/* Allocate Bloom filter hash function pointers */


if (!(bloom>hash = (hashfp_t *)malloc(num_hashes *sizeof(hashfp_t)))) {


free(bloom>a);


free(bloom);


return NULL;


}




/* Assign hash functions to pointers in the Bloom filter */


va_start(hashes, num_hashes);




for (n = 0; n < num_hashes; n++) {


bloom>hash[n] = va_arg(hashes, hashfp_t);


}




va_end(hashes);




/*


* Record the number of hash functions (k) and the number of bytes


* in the Bloom array (m).


*/


bloom>k = num_hashes;


bloom>m = size;




return bloom;


}






/******************************************************************************


* bloom_del Delete a Bloom filter.


* `````````


* @bloom : The condemned.


* Returns: nothing.


*


******************************************************************************/


void bloom_del(struct bloom_t *bloom)


{


free(bloom>a);


free(bloom>hash);


free(bloom);


}






/******************************************************************************


* bloom_add Add a string to a Bloom filter.


* `````````


* @bloom : Bloom filter


* @s : string to add


* Returns: nothing.


*


* CAVEAT


* Once a string has been added to the filter, it cannot be "removed"!


*


******************************************************************************/


void bloom_add(struct bloom_t *bloom, const char *s)


{


unsigned int hash;


int n;




for (n = 0; n < bloom>k; n++) {


hash = (unsigned int)bloom>hash[n](s);


SETBIT(bloom>a, (hash % bloom>m));


}


}






/******************************************************************************


* bloom_check Determine if a string is in the Bloom filter.


* ```````````


* @bloom : Bloom filter


* @s : string to add


* Returns: false if string does not exist in the filter, otherwise true.


*


* NOTES


*


* So this is the freakshow that bored programmers pay a nickel to get a


* peek at, step right up. This is the way the membership test works.


*


* The string 's' is hashed once for each of the 'k' hash functions, as


* though we were planning to add it to the filter. Instead of adding it


* however, we examine the bit that we *would* have set, and consider its


* value.


*


* If the bit is 1 (set), the string we are hashing may be in the filter,


* since it would have set this bit when it was originally hashed. However,


* it may also be that another string just happened to produce a hash value


* that would also set this bit. That would be a false positive. This is why


* we have k > 1, so we can minimize the likelihood of false positives


* occuring.


*


* If every bit corresponding to every one of the k hashes of our query


* string is set, we can say with some probability of being correct that


* the string we are holding is indeed "in" the filter. However, we can


* never be sure.


*


* If, however, as we hash our string and peek at the resulting bit in the


* filter, we find the bit is 0 (not set)... well now, that's different.


* In this case, we can say with absolute certainty that the string we are


* holding is *not* in the filter, because if it were, this bit would have


* to be set.


*


* In this way, the Bloom filter can answer NO with absolute surety, but


* can only speak a qualified YES.


*


******************************************************************************/


bool bloom_check(struct bloom_t *bloom, const char *s)


{


unsigned int hash;


int n;




for (n = 0; n < bloom>k; n++) {


hash = (unsigned int)bloom>hash[n](s);




if (!(GETBIT(bloom>a, (hash % bloom>m)))) {


return false;


}


}




return true; /* ? */


}



@ 0,0 +1,22 @@


#ifndef _BLOOM_FILTER_H


#define _BLOOM_FILTER_H




#include <stdlib.h>


#include <stdbool.h>


#include <stdint.h>




typedef unsigned int (*hashfp_t)(const char *);




struct bloom_t {


size_t m;


size_t k;


unsigned char *a;


hashfp_t *hash;


};




struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...);


void bloom_del(struct bloom_t *bloom);


void bloom_add(struct bloom_t *bloom, const char *s);


bool bloom_check(struct bloom_t *bloom, const char *s);




#endif

@ 0,0 +1,164 @@


/******************************************************************************


* djb2_hash


* `````````


* HISTORY


* This algorithm (k=33) was first reported by Dan Bernstein many years


* ago in comp.lang.c. Another version of this algorithm (now favored by


* bernstein) uses XOR:


*


* hash(i) = hash(i  1) * 33 ^ str[i];


*


* The magic of number 33 (why it works better than many other constants,


* prime or not) has never been adequately explained.


*


******************************************************************************/


static inline unsigned long djb2_hash(const char *str)


{


unsigned long hash;


int c;




hash = 5381;




while ((c = (unsigned char) * str++)) {


hash = ((hash << 5) + hash) + c; /* hash * 33 + c */


}




return hash;


}




/******************************************************************************


* sdbm_hash


* `````````


* HISTORY


* This algorithm was created for sdbm (a publicdomain reimplementation


* of ndbm) database library. It was found to do well in scrambling bits,


* causing better distribution of the keys and fewer splits. it also


* happens to be a good general hashing function with good distribution.


*


* The actual function is


*


* hash(i) = hash(i  1) * 65599 + str[i];


*


* What is included below is the faster version used in gawk. [there is


* even a faster, duffdevice version] the magic constant 65599 was picked


* out of thin air while experimenting with different constants, and turns


* out to be a prime. this is one of the algorithms used in berkeley db


* (see sleepycat) and elsewhere.


*


******************************************************************************/


static inline unsigned long sdbm_hash(const char *str)


{


unsigned long hash;


int c;




hash = 0;




while ((c = (unsigned char) * str++)) {


hash = c + (hash << 6) + (hash << 16)  hash;


}




return hash;


}




/******************************************************************************


* lose lose


* `````````


* HISTORY


* This hash function appeared in K&R (1st ed) but at least the reader


* was warned:


*


* "This is not the best possible algorithm, but it has the merit


* of extreme simplicity."


*


* This is an understatement. It is a terrible hashing algorithm, and it


* could have been much better without sacrificing its "extreme simplicity."


* [see the second edition!]


*


* Many C programmers use this function without actually testing it, or


* checking something like Knuth's Sorting and Searching, so it stuck.


* It is now found mixed with otherwise respectable code, eg. cnews. sigh.


* [see also: tpop]


*


******************************************************************************/


static inline unsigned long kr_hash(const char *str)


{


unsigned int hash;


unsigned int c;




hash = 0;




while ((c = (unsigned char) * str++)) {


hash += c;


}




return hash;


}




/******************************************************************************


* sax_hash


* ````````


* Shift, Add, XOR


*


******************************************************************************/


static inline unsigned int sax_hash(const char *key)


{


unsigned int h;




h = 0;




while (*key) {


h ^= (h << 5) + (h >> 2) + (unsigned char) * key++;


}




return h;


}






/******************************************************************************


* dek_hash


* ````````


* HISTORY


* Proposed by Donald E. Knuth in The Art Of Computer Programming Vol. 3,


* under the topic of "Sorting and Search", Chapter 6.4.


*


******************************************************************************/


static inline unsigned int dek_hash(const char *str, unsigned int len)


{


unsigned int hash;


unsigned int c;




hash = len;


c = 0;




while ((c = (unsigned int) * str++)) {


hash = ((hash << 5) ^ (hash >> 27)) ^ (c);


}




return hash;


}






/******************************************************************************


* fnv_hash


* ````````


* NOTE


* For a more fully featured and modern version of this hash, see fnv32.c


*


******************************************************************************/


static inline unsigned int fnv_hash(const char *str)


{


#define FNV_PRIME 0x811C9DC5


unsigned int hash;


unsigned int c;




hash = 0;


c = 0;




while ((c = (unsigned int) * str++)) {


hash *= FNV_PRIME;


hash ^= (c);


}




return hash;


}



Loading…
Reference in New Issue