Garima Singh | f52207a | 2020-03-30 00:31:24 +0000 | [diff] [blame] | 1 | #ifndef BLOOM_H |
| 2 | #define BLOOM_H |
| 3 | |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 4 | struct commit; |
| 5 | struct repository; |
| 6 | |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 7 | struct bloom_filter_settings { |
| 8 | /* |
| 9 | * The version of the hashing technique being used. |
| 10 | * We currently only support version = 1 which is |
| 11 | * the seeded murmur3 hashing technique implemented |
| 12 | * in bloom.c. |
| 13 | */ |
| 14 | uint32_t hash_version; |
| 15 | |
| 16 | /* |
| 17 | * The number of times a path is hashed, i.e. the |
| 18 | * number of bit positions tht cumulatively |
| 19 | * determine whether a path is present in the |
| 20 | * Bloom filter. |
| 21 | */ |
| 22 | uint32_t num_hashes; |
| 23 | |
| 24 | /* |
| 25 | * The minimum number of bits per entry in the Bloom |
| 26 | * filter. If the filter contains 'n' entries, then |
| 27 | * filter size is the minimum number of 8-bit words |
| 28 | * that contain n*b bits. |
| 29 | */ |
| 30 | uint32_t bits_per_entry; |
Taylor Blau | 97ffa4f | 2020-09-17 09:34:42 -0400 | [diff] [blame] | 31 | |
| 32 | /* |
| 33 | * The maximum number of changed paths per commit |
| 34 | * before declaring a Bloom filter to be too-large. |
| 35 | * |
| 36 | * Not written to the commit-graph file. |
| 37 | */ |
| 38 | uint32_t max_changed_paths; |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 39 | }; |
| 40 | |
Taylor Blau | 97ffa4f | 2020-09-17 09:34:42 -0400 | [diff] [blame] | 41 | #define DEFAULT_BLOOM_MAX_CHANGES 512 |
| 42 | #define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES } |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 43 | #define BITS_PER_WORD 8 |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 44 | #define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t) |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 45 | |
| 46 | /* |
| 47 | * A bloom_filter struct represents a data segment to |
| 48 | * use when testing hash values. The 'len' member |
| 49 | * dictates how many entries are stored in |
| 50 | * 'data'. |
| 51 | */ |
| 52 | struct bloom_filter { |
| 53 | unsigned char *data; |
| 54 | size_t len; |
| 55 | }; |
| 56 | |
| 57 | /* |
| 58 | * A bloom_key represents the k hash values for a |
| 59 | * given string. These can be precomputed and |
| 60 | * stored in a bloom_key for re-use when testing |
| 61 | * against a bloom_filter. The number of hashes is |
| 62 | * given by the Bloom filter settings and is the same |
| 63 | * for all Bloom filters and keys interacting with |
| 64 | * the loaded version of the commit graph file and |
| 65 | * the Bloom data chunks. |
| 66 | */ |
| 67 | struct bloom_key { |
| 68 | uint32_t *hashes; |
| 69 | }; |
| 70 | |
Garima Singh | f52207a | 2020-03-30 00:31:24 +0000 | [diff] [blame] | 71 | /* |
| 72 | * Calculate the murmur3 32-bit hash value for the given data |
| 73 | * using the given seed. |
| 74 | * Produces a uniformly distributed hash value. |
| 75 | * Not considered to be cryptographically secure. |
| 76 | * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm |
| 77 | */ |
| 78 | uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len); |
| 79 | |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 80 | void fill_bloom_key(const char *data, |
| 81 | size_t len, |
| 82 | struct bloom_key *key, |
| 83 | const struct bloom_filter_settings *settings); |
Derrick Stolee | f32dde8 | 2020-05-11 11:56:19 +0000 | [diff] [blame] | 84 | void clear_bloom_key(struct bloom_key *key); |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 85 | |
| 86 | void add_key_to_filter(const struct bloom_key *key, |
Derrick Stolee | eb591e4 | 2020-05-01 15:30:18 +0000 | [diff] [blame] | 87 | struct bloom_filter *filter, |
| 88 | const struct bloom_filter_settings *settings); |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 89 | |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 90 | void init_bloom_filters(void); |
| 91 | |
Taylor Blau | 312cff5 | 2020-09-16 14:07:32 -0400 | [diff] [blame] | 92 | enum bloom_filter_computed { |
| 93 | BLOOM_NOT_COMPUTED = (1 << 0), |
| 94 | BLOOM_COMPUTED = (1 << 1), |
| 95 | BLOOM_TRUNC_LARGE = (1 << 2), |
Taylor Blau | 59f0d50 | 2020-09-17 22:59:44 -0400 | [diff] [blame] | 96 | BLOOM_TRUNC_EMPTY = (1 << 3), |
Taylor Blau | 312cff5 | 2020-09-16 14:07:32 -0400 | [diff] [blame] | 97 | }; |
| 98 | |
| 99 | struct bloom_filter *get_or_compute_bloom_filter(struct repository *r, |
| 100 | struct commit *c, |
| 101 | int compute_if_not_present, |
Taylor Blau | 9a7a9ed | 2020-09-16 14:07:46 -0400 | [diff] [blame] | 102 | const struct bloom_filter_settings *settings, |
Taylor Blau | 312cff5 | 2020-09-16 14:07:32 -0400 | [diff] [blame] | 103 | enum bloom_filter_computed *computed); |
| 104 | |
| 105 | #define get_bloom_filter(r, c) get_or_compute_bloom_filter( \ |
Taylor Blau | 9a7a9ed | 2020-09-16 14:07:46 -0400 | [diff] [blame] | 106 | (r), (c), 0, NULL, NULL) |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 107 | |
Garima Singh | a56b946 | 2020-04-06 16:59:52 +0000 | [diff] [blame] | 108 | int bloom_filter_contains(const struct bloom_filter *filter, |
| 109 | const struct bloom_key *key, |
| 110 | const struct bloom_filter_settings *settings); |
| 111 | |
Đoàn Trần Công Danh | 066b70a | 2020-05-08 00:51:02 +0100 | [diff] [blame] | 112 | #endif |