aboutsummaryrefslogtreecommitdiff
path: root/src/lib/container/bloomfilt.c
blob: ace3f6786e18ffc997a4bd7acd533ebc253c9f41 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/* Copyright (c) 2003-2004, Roger Dingledine
 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
 * Copyright (c) 2007-2021, The Tor Project, Inc. */
/* See LICENSE for licensing information */

/**
 * \file bloomfilt.c
 * \brief Uses bitarray_t to implement a bloom filter.
 **/

#include <stdlib.h>

#include "lib/malloc/malloc.h"
#include "lib/container/bloomfilt.h"
#include "lib/intmath/bits.h"
#include "lib/log/util_bug.h"
#include "ext/siphash.h"

/** How many bloom-filter bits we set per address. This is twice the
 * BLOOMFILT_N_HASHES value, since we split the siphash output into two 32-bit
 * values. */
#define N_BITS_PER_ITEM (BLOOMFILT_N_HASHES * 2)

struct bloomfilt_t {
  /** siphash keys to make BLOOMFILT_N_HASHES independent hashes for each
   * items. */
  struct sipkey key[BLOOMFILT_N_HASHES];
  bloomfilt_hash_fn hashfn; /**< Function used to generate hashes */
  uint32_t mask; /**< One less than the number of bits in <b>ba</b>; always
                  * one less than a power of two. */
  bitarray_t *ba; /**< A bit array to implement the Bloom filter. */
};

#define BIT(set, n) ((n) & (set)->mask)

/** Add the element <b>item</b> to <b>set</b>. */
void
bloomfilt_add(bloomfilt_t *set,
              const void *item)
{
  int i;
  for (i = 0; i < BLOOMFILT_N_HASHES; ++i) {
    uint64_t h = set->hashfn(&set->key[i], item);
    uint32_t high_bits = (uint32_t)(h >> 32);
    uint32_t low_bits = (uint32_t)(h);
    bitarray_set(set->ba, BIT(set, high_bits));
    bitarray_set(set->ba, BIT(set, low_bits));
  }
}

/** If <b>item</b> is in <b>set</b>, return nonzero.  Otherwise,
 * <em>probably</em> return zero. */
int
bloomfilt_probably_contains(const bloomfilt_t *set,
                            const void *item)
{
  int i, matches = 0;
  for (i = 0; i < BLOOMFILT_N_HASHES; ++i) {
    uint64_t h = set->hashfn(&set->key[i], item);
    uint32_t high_bits = (uint32_t)(h >> 32);
    uint32_t low_bits = (uint32_t)(h);
    // Note that !! is necessary here, since bitarray_is_set does not
    // necessarily return 1 on true.
    matches += !! bitarray_is_set(set->ba, BIT(set, high_bits));
    matches += !! bitarray_is_set(set->ba, BIT(set, low_bits));
  }
  return matches == N_BITS_PER_ITEM;
}

/** Return a newly allocated bloomfilt_t, optimized to hold a total of
 * <b>max_elements</b> elements with a reasonably low false positive weight.
 *
 * Uses the siphash-based function <b>hashfn</b> to compute hard-to-collide
 * functions of the items, and the key material <b>random_key</b> to
 * key the hash.  There must be BLOOMFILT_KEY_LEN bytes in the supplied key.
 **/
bloomfilt_t *
bloomfilt_new(int max_elements,
              bloomfilt_hash_fn hashfn,
              const uint8_t *random_key)
{
  /* The probability of false positives is about P=(1 - exp(-kn/m))^k, where k
   * is the number of hash functions per entry, m is the bits in the array,
   * and n is the number of elements inserted.  For us, k==4, n<=max_elements,
   * and m==n_bits= approximately max_elements*32.  This gives
   *   P<(1-exp(-4*n/(32*n)))^4 == (1-exp(1/-8))^4 == .00019
   *
   * It would be more optimal in space vs false positives to get this false
   * positive rate by going for k==13, and m==18.5n, but we also want to
   * conserve CPU, and k==13 is pretty big.
   */
  int n_bits = 1u << (tor_log2(max_elements)+5);
  bloomfilt_t *r = tor_malloc(sizeof(bloomfilt_t));
  r->mask = n_bits - 1;
  r->ba = bitarray_init_zero(n_bits);

  tor_assert(sizeof(r->key) == BLOOMFILT_KEY_LEN);
  memcpy(r->key, random_key, sizeof(r->key));

  r->hashfn = hashfn;

  return r;
}

/** Free all storage held in <b>set</b>. */
void
bloomfilt_free_(bloomfilt_t *set)
{
  if (!set)
    return;
  bitarray_free(set->ba);
  tor_free(set);
}