Garima Singh | f52207a | 2020-03-30 00:31:24 +0000 | [diff] [blame] | 1 | #include "git-compat-util.h" |
| 2 | #include "bloom.h" |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 3 | #include "diff.h" |
| 4 | #include "diffcore.h" |
| 5 | #include "revision.h" |
| 6 | #include "hashmap.h" |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 7 | #include "commit-graph.h" |
| 8 | #include "commit.h" |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 9 | |
| 10 | define_commit_slab(bloom_filter_slab, struct bloom_filter); |
| 11 | |
Đoàn Trần Công Danh | 066b70a | 2020-05-08 00:51:02 +0100 | [diff] [blame] | 12 | static struct bloom_filter_slab bloom_filters; |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 13 | |
| 14 | struct pathmap_hash_entry { |
| 15 | struct hashmap_entry entry; |
| 16 | const char path[FLEX_ARRAY]; |
| 17 | }; |
Garima Singh | f52207a | 2020-03-30 00:31:24 +0000 | [diff] [blame] | 18 | |
| 19 | static uint32_t rotate_left(uint32_t value, int32_t count) |
| 20 | { |
| 21 | uint32_t mask = 8 * sizeof(uint32_t) - 1; |
| 22 | count &= mask; |
| 23 | return ((value << count) | (value >> ((-count) & mask))); |
| 24 | } |
| 25 | |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 26 | static inline unsigned char get_bitmask(uint32_t pos) |
| 27 | { |
| 28 | return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1)); |
| 29 | } |
| 30 | |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 31 | static int load_bloom_filter_from_graph(struct commit_graph *g, |
Derrick Stolee | eb591e4 | 2020-05-01 15:30:18 +0000 | [diff] [blame] | 32 | struct bloom_filter *filter, |
Taylor Blau | 9550f6c | 2022-07-12 19:10:33 -0400 | [diff] [blame] | 33 | uint32_t graph_pos) |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 34 | { |
| 35 | uint32_t lex_pos, start_index, end_index; |
| 36 | |
Abhishek Kumar | c752ad0 | 2020-06-17 14:44:11 +0530 | [diff] [blame] | 37 | while (graph_pos < g->num_commits_in_base) |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 38 | g = g->base_graph; |
| 39 | |
Taylor Blau | 4f36440 | 2020-09-09 11:22:44 -0400 | [diff] [blame] | 40 | /* The commit graph commit 'c' lives in doesn't carry Bloom filters. */ |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 41 | if (!g->chunk_bloom_indexes) |
| 42 | return 0; |
| 43 | |
Abhishek Kumar | c752ad0 | 2020-06-17 14:44:11 +0530 | [diff] [blame] | 44 | lex_pos = graph_pos - g->num_commits_in_base; |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 45 | |
| 46 | end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos); |
| 47 | |
| 48 | if (lex_pos > 0) |
| 49 | start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1)); |
| 50 | else |
| 51 | start_index = 0; |
| 52 | |
| 53 | filter->len = end_index - start_index; |
| 54 | filter->data = (unsigned char *)(g->chunk_bloom_data + |
| 55 | sizeof(unsigned char) * start_index + |
| 56 | BLOOMDATA_CHUNK_HEADER_SIZE); |
| 57 | |
| 58 | return 1; |
| 59 | } |
| 60 | |
Garima Singh | f52207a | 2020-03-30 00:31:24 +0000 | [diff] [blame] | 61 | /* |
| 62 | * Calculate the murmur3 32-bit hash value for the given data |
| 63 | * using the given seed. |
| 64 | * Produces a uniformly distributed hash value. |
| 65 | * Not considered to be cryptographically secure. |
| 66 | * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm |
| 67 | */ |
| 68 | uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len) |
| 69 | { |
| 70 | const uint32_t c1 = 0xcc9e2d51; |
| 71 | const uint32_t c2 = 0x1b873593; |
| 72 | const uint32_t r1 = 15; |
| 73 | const uint32_t r2 = 13; |
| 74 | const uint32_t m = 5; |
| 75 | const uint32_t n = 0xe6546b64; |
| 76 | int i; |
| 77 | uint32_t k1 = 0; |
| 78 | const char *tail; |
| 79 | |
| 80 | int len4 = len / sizeof(uint32_t); |
| 81 | |
| 82 | uint32_t k; |
| 83 | for (i = 0; i < len4; i++) { |
| 84 | uint32_t byte1 = (uint32_t)data[4*i]; |
| 85 | uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8; |
| 86 | uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16; |
| 87 | uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24; |
| 88 | k = byte1 | byte2 | byte3 | byte4; |
| 89 | k *= c1; |
| 90 | k = rotate_left(k, r1); |
| 91 | k *= c2; |
| 92 | |
| 93 | seed ^= k; |
| 94 | seed = rotate_left(seed, r2) * m + n; |
| 95 | } |
| 96 | |
| 97 | tail = (data + len4 * sizeof(uint32_t)); |
| 98 | |
| 99 | switch (len & (sizeof(uint32_t) - 1)) { |
| 100 | case 3: |
| 101 | k1 ^= ((uint32_t)tail[2]) << 16; |
| 102 | /*-fallthrough*/ |
| 103 | case 2: |
| 104 | k1 ^= ((uint32_t)tail[1]) << 8; |
| 105 | /*-fallthrough*/ |
| 106 | case 1: |
| 107 | k1 ^= ((uint32_t)tail[0]) << 0; |
| 108 | k1 *= c1; |
| 109 | k1 = rotate_left(k1, r1); |
| 110 | k1 *= c2; |
| 111 | seed ^= k1; |
| 112 | break; |
| 113 | } |
| 114 | |
| 115 | seed ^= (uint32_t)len; |
| 116 | seed ^= (seed >> 16); |
| 117 | seed *= 0x85ebca6b; |
| 118 | seed ^= (seed >> 13); |
| 119 | seed *= 0xc2b2ae35; |
| 120 | seed ^= (seed >> 16); |
| 121 | |
| 122 | return seed; |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 123 | } |
| 124 | |
| 125 | void fill_bloom_key(const char *data, |
Derrick Stolee | eb591e4 | 2020-05-01 15:30:18 +0000 | [diff] [blame] | 126 | size_t len, |
| 127 | struct bloom_key *key, |
| 128 | const struct bloom_filter_settings *settings) |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 129 | { |
| 130 | int i; |
| 131 | const uint32_t seed0 = 0x293ae76f; |
| 132 | const uint32_t seed1 = 0x7e646e2c; |
| 133 | const uint32_t hash0 = murmur3_seeded(seed0, data, len); |
| 134 | const uint32_t hash1 = murmur3_seeded(seed1, data, len); |
| 135 | |
| 136 | key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t)); |
| 137 | for (i = 0; i < settings->num_hashes; i++) |
| 138 | key->hashes[i] = hash0 + i * hash1; |
| 139 | } |
| 140 | |
Derrick Stolee | f32dde8 | 2020-05-11 11:56:19 +0000 | [diff] [blame] | 141 | void clear_bloom_key(struct bloom_key *key) |
| 142 | { |
| 143 | FREE_AND_NULL(key->hashes); |
| 144 | } |
| 145 | |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 146 | void add_key_to_filter(const struct bloom_key *key, |
Derrick Stolee | eb591e4 | 2020-05-01 15:30:18 +0000 | [diff] [blame] | 147 | struct bloom_filter *filter, |
| 148 | const struct bloom_filter_settings *settings) |
Garima Singh | f1294ea | 2020-03-30 00:31:25 +0000 | [diff] [blame] | 149 | { |
| 150 | int i; |
| 151 | uint64_t mod = filter->len * BITS_PER_WORD; |
| 152 | |
| 153 | for (i = 0; i < settings->num_hashes; i++) { |
| 154 | uint64_t hash_mod = key->hashes[i] % mod; |
| 155 | uint64_t block_pos = hash_mod / BITS_PER_WORD; |
| 156 | |
| 157 | filter->data[block_pos] |= get_bitmask(hash_mod); |
| 158 | } |
| 159 | } |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 160 | |
| 161 | void init_bloom_filters(void) |
| 162 | { |
| 163 | init_bloom_filter_slab(&bloom_filters); |
| 164 | } |
| 165 | |
Ævar Arnfjörð Bjarmason | 5cf88fd | 2022-08-25 19:09:48 +0200 | [diff] [blame] | 166 | static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED, |
Derrick Stolee | 65c1a28 | 2020-05-11 11:56:12 +0000 | [diff] [blame] | 167 | const struct hashmap_entry *eptr, |
| 168 | const struct hashmap_entry *entry_or_key, |
Ævar Arnfjörð Bjarmason | 5cf88fd | 2022-08-25 19:09:48 +0200 | [diff] [blame] | 169 | const void *keydata UNUSED) |
Derrick Stolee | 65c1a28 | 2020-05-11 11:56:12 +0000 | [diff] [blame] | 170 | { |
| 171 | const struct pathmap_hash_entry *e1, *e2; |
| 172 | |
| 173 | e1 = container_of(eptr, const struct pathmap_hash_entry, entry); |
| 174 | e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry); |
| 175 | |
| 176 | return strcmp(e1->path, e2->path); |
| 177 | } |
| 178 | |
Taylor Blau | 59f0d50 | 2020-09-17 22:59:44 -0400 | [diff] [blame] | 179 | static void init_truncated_large_filter(struct bloom_filter *filter) |
| 180 | { |
| 181 | filter->data = xmalloc(1); |
| 182 | filter->data[0] = 0xFF; |
| 183 | filter->len = 1; |
| 184 | } |
| 185 | |
Taylor Blau | 312cff5 | 2020-09-16 14:07:32 -0400 | [diff] [blame] | 186 | struct bloom_filter *get_or_compute_bloom_filter(struct repository *r, |
| 187 | struct commit *c, |
| 188 | int compute_if_not_present, |
Taylor Blau | 9a7a9ed | 2020-09-16 14:07:46 -0400 | [diff] [blame] | 189 | const struct bloom_filter_settings *settings, |
Taylor Blau | 312cff5 | 2020-09-16 14:07:32 -0400 | [diff] [blame] | 190 | enum bloom_filter_computed *computed) |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 191 | { |
| 192 | struct bloom_filter *filter; |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 193 | int i; |
| 194 | struct diff_options diffopt; |
| 195 | |
Taylor Blau | 312cff5 | 2020-09-16 14:07:32 -0400 | [diff] [blame] | 196 | if (computed) |
| 197 | *computed = BLOOM_NOT_COMPUTED; |
| 198 | |
Derrick Stolee | 9491974 | 2020-07-01 13:27:23 +0000 | [diff] [blame] | 199 | if (!bloom_filters.slab_size) |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 200 | return NULL; |
| 201 | |
| 202 | filter = bloom_filter_slab_at(&bloom_filters, c); |
| 203 | |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 204 | if (!filter->data) { |
Taylor Blau | 9550f6c | 2022-07-12 19:10:33 -0400 | [diff] [blame] | 205 | uint32_t graph_pos; |
| 206 | if (repo_find_commit_pos_in_graph(r, c, &graph_pos)) |
| 207 | load_bloom_filter_from_graph(r->objects->commit_graph, |
| 208 | filter, graph_pos); |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 209 | } |
| 210 | |
Taylor Blau | 809e032 | 2020-09-18 09:27:27 -0400 | [diff] [blame] | 211 | if (filter->data && filter->len) |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 212 | return filter; |
Derrick Stolee | 9491974 | 2020-07-01 13:27:23 +0000 | [diff] [blame] | 213 | if (!compute_if_not_present) |
| 214 | return NULL; |
Garima Singh | 1217c03 | 2020-04-06 16:59:50 +0000 | [diff] [blame] | 215 | |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 216 | repo_diff_setup(r, &diffopt); |
| 217 | diffopt.flags.recursive = 1; |
Derrick Stolee | caf388c | 2020-04-09 13:00:11 +0000 | [diff] [blame] | 218 | diffopt.detect_rename = 0; |
Taylor Blau | 9a7a9ed | 2020-09-16 14:07:46 -0400 | [diff] [blame] | 219 | diffopt.max_changes = settings->max_changed_paths; |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 220 | diff_setup_done(&diffopt); |
| 221 | |
Derrick Stolee | 891c17c | 2020-05-11 11:56:10 +0000 | [diff] [blame] | 222 | /* ensure commit is parsed so we have parent information */ |
| 223 | repo_parse_commit(r, c); |
| 224 | |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 225 | if (c->parents) |
| 226 | diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt); |
| 227 | else |
| 228 | diff_tree_oid(NULL, &c->object.oid, "", &diffopt); |
| 229 | diffcore_std(&diffopt); |
| 230 | |
Derrick Stolee | b16a827 | 2020-09-16 14:07:52 -0400 | [diff] [blame] | 231 | if (diff_queued_diff.nr <= settings->max_changed_paths) { |
Elijah Newren | b19315d | 2020-11-11 20:02:20 +0000 | [diff] [blame] | 232 | struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL); |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 233 | struct pathmap_hash_entry *e; |
| 234 | struct hashmap_iter iter; |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 235 | |
| 236 | for (i = 0; i < diff_queued_diff.nr; i++) { |
| 237 | const char *path = diff_queued_diff.queue[i]->two->path; |
| 238 | |
| 239 | /* |
Derrick Stolee | 65c1a28 | 2020-05-11 11:56:12 +0000 | [diff] [blame] | 240 | * Add each leading directory of the changed file, i.e. for |
| 241 | * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so |
| 242 | * the Bloom filter could be used to speed up commands like |
| 243 | * 'git log dir/subdir', too. |
| 244 | * |
| 245 | * Note that directories are added without the trailing '/'. |
| 246 | */ |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 247 | do { |
| 248 | char *last_slash = strrchr(path, '/'); |
| 249 | |
| 250 | FLEX_ALLOC_STR(e, path, path); |
| 251 | hashmap_entry_init(&e->entry, strhash(path)); |
Derrick Stolee | 65c1a28 | 2020-05-11 11:56:12 +0000 | [diff] [blame] | 252 | |
| 253 | if (!hashmap_get(&pathmap, &e->entry, NULL)) |
| 254 | hashmap_add(&pathmap, &e->entry); |
| 255 | else |
| 256 | free(e); |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 257 | |
| 258 | if (!last_slash) |
| 259 | last_slash = (char*)path; |
| 260 | *last_slash = '\0'; |
| 261 | |
| 262 | } while (*path); |
| 263 | |
| 264 | diff_free_filepair(diff_queued_diff.queue[i]); |
| 265 | } |
| 266 | |
Derrick Stolee | b16a827 | 2020-09-16 14:07:52 -0400 | [diff] [blame] | 267 | if (hashmap_get_size(&pathmap) > settings->max_changed_paths) { |
Taylor Blau | 59f0d50 | 2020-09-17 22:59:44 -0400 | [diff] [blame] | 268 | init_truncated_large_filter(filter); |
Derrick Stolee | b16a827 | 2020-09-16 14:07:52 -0400 | [diff] [blame] | 269 | if (computed) |
| 270 | *computed |= BLOOM_TRUNC_LARGE; |
| 271 | goto cleanup; |
| 272 | } |
| 273 | |
Taylor Blau | 9a7a9ed | 2020-09-16 14:07:46 -0400 | [diff] [blame] | 274 | filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD; |
Taylor Blau | 59f0d50 | 2020-09-17 22:59:44 -0400 | [diff] [blame] | 275 | if (!filter->len) { |
| 276 | if (computed) |
| 277 | *computed |= BLOOM_TRUNC_EMPTY; |
| 278 | filter->len = 1; |
| 279 | } |
René Scharfe | ca56dad | 2021-03-13 17:17:22 +0100 | [diff] [blame] | 280 | CALLOC_ARRAY(filter->data, filter->len); |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 281 | |
| 282 | hashmap_for_each_entry(&pathmap, &iter, e, entry) { |
| 283 | struct bloom_key key; |
Taylor Blau | 9a7a9ed | 2020-09-16 14:07:46 -0400 | [diff] [blame] | 284 | fill_bloom_key(e->path, strlen(e->path), &key, settings); |
| 285 | add_key_to_filter(&key, filter, settings); |
Andrzej Hunt | b180c68 | 2021-04-25 14:16:11 +0000 | [diff] [blame] | 286 | clear_bloom_key(&key); |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 287 | } |
| 288 | |
Derrick Stolee | b16a827 | 2020-09-16 14:07:52 -0400 | [diff] [blame] | 289 | cleanup: |
Elijah Newren | 6da1a25 | 2020-11-02 18:55:05 +0000 | [diff] [blame] | 290 | hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry); |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 291 | } else { |
| 292 | for (i = 0; i < diff_queued_diff.nr; i++) |
| 293 | diff_free_filepair(diff_queued_diff.queue[i]); |
Taylor Blau | 59f0d50 | 2020-09-17 22:59:44 -0400 | [diff] [blame] | 294 | init_truncated_large_filter(filter); |
Taylor Blau | 312cff5 | 2020-09-16 14:07:32 -0400 | [diff] [blame] | 295 | |
| 296 | if (computed) |
| 297 | *computed |= BLOOM_TRUNC_LARGE; |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 298 | } |
| 299 | |
Taylor Blau | 312cff5 | 2020-09-16 14:07:32 -0400 | [diff] [blame] | 300 | if (computed) |
| 301 | *computed |= BLOOM_COMPUTED; |
| 302 | |
Garima Singh | ed591fe | 2020-03-30 00:31:26 +0000 | [diff] [blame] | 303 | free(diff_queued_diff.queue); |
| 304 | DIFF_QUEUE_CLEAR(&diff_queued_diff); |
| 305 | |
| 306 | return filter; |
| 307 | } |
Garima Singh | a56b946 | 2020-04-06 16:59:52 +0000 | [diff] [blame] | 308 | |
| 309 | int bloom_filter_contains(const struct bloom_filter *filter, |
| 310 | const struct bloom_key *key, |
| 311 | const struct bloom_filter_settings *settings) |
| 312 | { |
| 313 | int i; |
| 314 | uint64_t mod = filter->len * BITS_PER_WORD; |
| 315 | |
| 316 | if (!mod) |
| 317 | return -1; |
| 318 | |
| 319 | for (i = 0; i < settings->num_hashes; i++) { |
| 320 | uint64_t hash_mod = key->hashes[i] % mod; |
| 321 | uint64_t block_pos = hash_mod / BITS_PER_WORD; |
| 322 | if (!(filter->data[block_pos] & get_bitmask(hash_mod))) |
| 323 | return 0; |
| 324 | } |
| 325 | |
| 326 | return 1; |
Đoàn Trần Công Danh | 066b70a | 2020-05-08 00:51:02 +0100 | [diff] [blame] | 327 | } |