1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
/* Copyright (c) 2003-2004, Roger Dingledine
* Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
* Copyright (c) 2007-2021, The Tor Project, Inc. */
/* See LICENSE for licensing information */
/**
* \file bloomfilt.c
* \brief Uses bitarray_t to implement a bloom filter.
**/
#include <stdlib.h>
#include "lib/malloc/malloc.h"
#include "lib/container/bloomfilt.h"
#include "lib/intmath/bits.h"
#include "lib/log/util_bug.h"
#include "ext/siphash.h"
/** How many bloom-filter bits we set per address. This is twice the
* BLOOMFILT_N_HASHES value, since we split the siphash output into two 32-bit
* values. */
#define N_BITS_PER_ITEM (BLOOMFILT_N_HASHES * 2)
struct bloomfilt_t {
/** siphash keys to make BLOOMFILT_N_HASHES independent hashes for each
* items. */
struct sipkey key[BLOOMFILT_N_HASHES];
bloomfilt_hash_fn hashfn; /**< Function used to generate hashes */
uint32_t mask; /**< One less than the number of bits in <b>ba</b>; always
* one less than a power of two. */
bitarray_t *ba; /**< A bit array to implement the Bloom filter. */
};
#define BIT(set, n) ((n) & (set)->mask)
/** Add the element <b>item</b> to <b>set</b>. */
void
bloomfilt_add(bloomfilt_t *set,
const void *item)
{
int i;
for (i = 0; i < BLOOMFILT_N_HASHES; ++i) {
uint64_t h = set->hashfn(&set->key[i], item);
uint32_t high_bits = (uint32_t)(h >> 32);
uint32_t low_bits = (uint32_t)(h);
bitarray_set(set->ba, BIT(set, high_bits));
bitarray_set(set->ba, BIT(set, low_bits));
}
}
/** If <b>item</b> is in <b>set</b>, return nonzero. Otherwise,
* <em>probably</em> return zero. */
int
bloomfilt_probably_contains(const bloomfilt_t *set,
const void *item)
{
int i, matches = 0;
for (i = 0; i < BLOOMFILT_N_HASHES; ++i) {
uint64_t h = set->hashfn(&set->key[i], item);
uint32_t high_bits = (uint32_t)(h >> 32);
uint32_t low_bits = (uint32_t)(h);
// Note that !! is necessary here, since bitarray_is_set does not
// necessarily return 1 on true.
matches += !! bitarray_is_set(set->ba, BIT(set, high_bits));
matches += !! bitarray_is_set(set->ba, BIT(set, low_bits));
}
return matches == N_BITS_PER_ITEM;
}
/** Return a newly allocated bloomfilt_t, optimized to hold a total of
* <b>max_elements</b> elements with a reasonably low false positive weight.
*
* Uses the siphash-based function <b>hashfn</b> to compute hard-to-collide
* functions of the items, and the key material <b>random_key</b> to
* key the hash. There must be BLOOMFILT_KEY_LEN bytes in the supplied key.
**/
bloomfilt_t *
bloomfilt_new(int max_elements,
bloomfilt_hash_fn hashfn,
const uint8_t *random_key)
{
/* The probability of false positives is about P=(1 - exp(-kn/m))^k, where k
* is the number of hash functions per entry, m is the bits in the array,
* and n is the number of elements inserted. For us, k==4, n<=max_elements,
* and m==n_bits= approximately max_elements*32. This gives
* P<(1-exp(-4*n/(32*n)))^4 == (1-exp(1/-8))^4 == .00019
*
* It would be more optimal in space vs false positives to get this false
* positive rate by going for k==13, and m==18.5n, but we also want to
* conserve CPU, and k==13 is pretty big.
*/
int n_bits = 1u << (tor_log2(max_elements)+5);
bloomfilt_t *r = tor_malloc(sizeof(bloomfilt_t));
r->mask = n_bits - 1;
r->ba = bitarray_init_zero(n_bits);
tor_assert(sizeof(r->key) == BLOOMFILT_KEY_LEN);
memcpy(r->key, random_key, sizeof(r->key));
r->hashfn = hashfn;
return r;
}
/** Free all storage held in <b>set</b>. */
void
bloomfilt_free_(bloomfilt_t *set)
{
if (!set)
return;
bitarray_free(set->ba);
tor_free(set);
}
|