Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 1 | #include "cache.h" |
| 2 | #include "pack-revindex.h" |
Stefan Beller | a80d72d | 2018-03-23 18:20:59 +0100 | [diff] [blame] | 3 | #include "object-store.h" |
Jeff King | 4828ce9 | 2019-04-05 14:04:24 -0400 | [diff] [blame] | 4 | #include "packfile.h" |
Taylor Blau | ec8e776 | 2021-01-25 18:37:46 -0500 | [diff] [blame] | 5 | #include "config.h" |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 6 | |
Taylor Blau | d5bc7c6 | 2021-01-13 17:25:06 -0500 | [diff] [blame] | 7 | struct revindex_entry { |
| 8 | off_t offset; |
| 9 | unsigned int nr; |
| 10 | }; |
| 11 | |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 12 | /* |
| 13 | * Pack index for existing packs give us easy access to the offsets into |
| 14 | * corresponding pack file where each object's data starts, but the entries |
| 15 | * do not store the size of the compressed representation (uncompressed |
| 16 | * size is easily available by examining the pack entry header). It is |
| 17 | * also rather expensive to find the sha1 for an object given its offset. |
| 18 | * |
Jeff King | f401533 | 2015-12-21 01:19:49 -0500 | [diff] [blame] | 19 | * The pack index file is sorted by object name mapping to offset; |
| 20 | * this revindex array is a list of offset/index_nr pairs |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 21 | * ordered by offset, so if you know the offset of an object, next offset |
| 22 | * is where its packed representation ends and the index_nr can be used to |
| 23 | * get the object sha1 from the main index. |
| 24 | */ |
| 25 | |
Jeff King | 8b8dfd5 | 2013-07-11 08:16:00 -0400 | [diff] [blame] | 26 | /* |
| 27 | * This is a least-significant-digit radix sort. |
| 28 | * |
| 29 | * It sorts each of the "n" items in "entries" by its offset field. The "max" |
| 30 | * parameter must be at least as large as the largest offset in the array, |
| 31 | * and lets us quit the sort early. |
| 32 | */ |
| 33 | static void sort_revindex(struct revindex_entry *entries, unsigned n, off_t max) |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 34 | { |
Jeff King | 8b8dfd5 | 2013-07-11 08:16:00 -0400 | [diff] [blame] | 35 | /* |
| 36 | * We use a "digit" size of 16 bits. That keeps our memory |
| 37 | * usage reasonable, and we can generally (for a 4G or smaller |
| 38 | * packfile) quit after two rounds of radix-sorting. |
| 39 | */ |
| 40 | #define DIGIT_SIZE (16) |
| 41 | #define BUCKETS (1 << DIGIT_SIZE) |
| 42 | /* |
| 43 | * We want to know the bucket that a[i] will go into when we are using |
| 44 | * the digit that is N bits from the (least significant) end. |
| 45 | */ |
| 46 | #define BUCKET_FOR(a, i, bits) (((a)[(i)].offset >> (bits)) & (BUCKETS-1)) |
| 47 | |
| 48 | /* |
| 49 | * We need O(n) temporary storage. Rather than do an extra copy of the |
| 50 | * partial results into "entries", we sort back and forth between the |
| 51 | * real array and temporary storage. In each iteration of the loop, we |
| 52 | * keep track of them with alias pointers, always sorting from "from" |
| 53 | * to "to". |
| 54 | */ |
Jeff King | b32fa95 | 2016-02-22 17:44:25 -0500 | [diff] [blame] | 55 | struct revindex_entry *tmp, *from, *to; |
Jeff King | 8b8dfd5 | 2013-07-11 08:16:00 -0400 | [diff] [blame] | 56 | int bits; |
Jeff King | b32fa95 | 2016-02-22 17:44:25 -0500 | [diff] [blame] | 57 | unsigned *pos; |
| 58 | |
| 59 | ALLOC_ARRAY(pos, BUCKETS); |
| 60 | ALLOC_ARRAY(tmp, n); |
| 61 | from = entries; |
| 62 | to = tmp; |
Jeff King | 8b8dfd5 | 2013-07-11 08:16:00 -0400 | [diff] [blame] | 63 | |
| 64 | /* |
| 65 | * If (max >> bits) is zero, then we know that the radix digit we are |
| 66 | * on (and any higher) will be zero for all entries, and our loop will |
| 67 | * be a no-op, as everybody lands in the same zero-th bucket. |
| 68 | */ |
| 69 | for (bits = 0; max >> bits; bits += DIGIT_SIZE) { |
Jeff King | 8b8dfd5 | 2013-07-11 08:16:00 -0400 | [diff] [blame] | 70 | unsigned i; |
| 71 | |
| 72 | memset(pos, 0, BUCKETS * sizeof(*pos)); |
| 73 | |
| 74 | /* |
| 75 | * We want pos[i] to store the index of the last element that |
| 76 | * will go in bucket "i" (actually one past the last element). |
| 77 | * To do this, we first count the items that will go in each |
| 78 | * bucket, which gives us a relative offset from the last |
| 79 | * bucket. We can then cumulatively add the index from the |
| 80 | * previous bucket to get the true index. |
| 81 | */ |
| 82 | for (i = 0; i < n; i++) |
| 83 | pos[BUCKET_FOR(from, i, bits)]++; |
| 84 | for (i = 1; i < BUCKETS; i++) |
| 85 | pos[i] += pos[i-1]; |
| 86 | |
| 87 | /* |
| 88 | * Now we can drop the elements into their correct buckets (in |
| 89 | * our temporary array). We iterate the pos counter backwards |
| 90 | * to avoid using an extra index to count up. And since we are |
| 91 | * going backwards there, we must also go backwards through the |
| 92 | * array itself, to keep the sort stable. |
| 93 | * |
| 94 | * Note that we use an unsigned iterator to make sure we can |
| 95 | * handle 2^32-1 objects, even on a 32-bit system. But this |
| 96 | * means we cannot use the more obvious "i >= 0" loop condition |
| 97 | * for counting backwards, and must instead check for |
| 98 | * wrap-around with UINT_MAX. |
| 99 | */ |
| 100 | for (i = n - 1; i != UINT_MAX; i--) |
| 101 | to[--pos[BUCKET_FOR(from, i, bits)]] = from[i]; |
| 102 | |
| 103 | /* |
| 104 | * Now "to" contains the most sorted list, so we swap "from" and |
| 105 | * "to" for the next iteration. |
| 106 | */ |
René Scharfe | 35d803b | 2017-01-28 22:40:58 +0100 | [diff] [blame] | 107 | SWAP(from, to); |
Jeff King | 8b8dfd5 | 2013-07-11 08:16:00 -0400 | [diff] [blame] | 108 | } |
| 109 | |
| 110 | /* |
| 111 | * If we ended with our data in the original array, great. If not, |
| 112 | * we have to move it back from the temporary storage. |
| 113 | */ |
| 114 | if (from != entries) |
René Scharfe | 45ccef8 | 2016-09-25 09:24:03 +0200 | [diff] [blame] | 115 | COPY_ARRAY(entries, tmp, n); |
Jeff King | 8b8dfd5 | 2013-07-11 08:16:00 -0400 | [diff] [blame] | 116 | free(tmp); |
| 117 | free(pos); |
| 118 | |
| 119 | #undef BUCKET_FOR |
| 120 | #undef BUCKETS |
| 121 | #undef DIGIT_SIZE |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 122 | } |
| 123 | |
| 124 | /* |
| 125 | * Ordered list of offsets of objects in the pack. |
| 126 | */ |
Jeff King | 9d98bbf | 2015-12-21 01:20:33 -0500 | [diff] [blame] | 127 | static void create_pack_revindex(struct packed_git *p) |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 128 | { |
Shahzad Lone | 33de80b | 2019-02-02 23:16:45 +0000 | [diff] [blame] | 129 | const unsigned num_ent = p->num_objects; |
Jeff King | 012b32b | 2013-07-10 07:50:26 -0400 | [diff] [blame] | 130 | unsigned i; |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 131 | const char *index = p->index_data; |
brian m. carlson | fa13080 | 2018-10-15 00:01:53 +0000 | [diff] [blame] | 132 | const unsigned hashsz = the_hash_algo->rawsz; |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 133 | |
Junio C Hamano | 11529ec | 2016-02-26 13:37:16 -0800 | [diff] [blame] | 134 | ALLOC_ARRAY(p->revindex, num_ent + 1); |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 135 | index += 4 * 256; |
| 136 | |
| 137 | if (p->index_version > 1) { |
| 138 | const uint32_t *off_32 = |
Jeff King | f86f769 | 2020-11-13 00:06:48 -0500 | [diff] [blame] | 139 | (uint32_t *)(index + 8 + (size_t)p->num_objects * (hashsz + 4)); |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 140 | const uint32_t *off_64 = off_32 + p->num_objects; |
| 141 | for (i = 0; i < num_ent; i++) { |
Shahzad Lone | 33de80b | 2019-02-02 23:16:45 +0000 | [diff] [blame] | 142 | const uint32_t off = ntohl(*off_32++); |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 143 | if (!(off & 0x80000000)) { |
Jeff King | 9d98bbf | 2015-12-21 01:20:33 -0500 | [diff] [blame] | 144 | p->revindex[i].offset = off; |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 145 | } else { |
Derrick Stolee | ad622a2 | 2018-01-17 14:08:23 -0500 | [diff] [blame] | 146 | p->revindex[i].offset = get_be64(off_64); |
| 147 | off_64 += 2; |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 148 | } |
Jeff King | 9d98bbf | 2015-12-21 01:20:33 -0500 | [diff] [blame] | 149 | p->revindex[i].nr = i; |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 150 | } |
| 151 | } else { |
| 152 | for (i = 0; i < num_ent; i++) { |
Shahzad Lone | 33de80b | 2019-02-02 23:16:45 +0000 | [diff] [blame] | 153 | const uint32_t hl = *((uint32_t *)(index + (hashsz + 4) * i)); |
Jeff King | 9d98bbf | 2015-12-21 01:20:33 -0500 | [diff] [blame] | 154 | p->revindex[i].offset = ntohl(hl); |
| 155 | p->revindex[i].nr = i; |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 156 | } |
| 157 | } |
| 158 | |
brian m. carlson | fa13080 | 2018-10-15 00:01:53 +0000 | [diff] [blame] | 159 | /* |
| 160 | * This knows the pack format -- the hash trailer |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 161 | * follows immediately after the last object data. |
| 162 | */ |
brian m. carlson | fa13080 | 2018-10-15 00:01:53 +0000 | [diff] [blame] | 163 | p->revindex[num_ent].offset = p->pack_size - hashsz; |
Jeff King | 9d98bbf | 2015-12-21 01:20:33 -0500 | [diff] [blame] | 164 | p->revindex[num_ent].nr = -1; |
| 165 | sort_revindex(p->revindex, num_ent, p->pack_size); |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 166 | } |
| 167 | |
Taylor Blau | 2f4ba2a | 2021-01-25 18:37:14 -0500 | [diff] [blame] | 168 | static int create_pack_revindex_in_memory(struct packed_git *p) |
| 169 | { |
Taylor Blau | ec8e776 | 2021-01-25 18:37:46 -0500 | [diff] [blame] | 170 | if (git_env_bool(GIT_TEST_REV_INDEX_DIE_IN_MEMORY, 0)) |
| 171 | die("dying as requested by '%s'", |
| 172 | GIT_TEST_REV_INDEX_DIE_IN_MEMORY); |
Taylor Blau | 2f4ba2a | 2021-01-25 18:37:14 -0500 | [diff] [blame] | 173 | if (open_pack_index(p)) |
| 174 | return -1; |
| 175 | create_pack_revindex(p); |
| 176 | return 0; |
| 177 | } |
| 178 | |
| 179 | static char *pack_revindex_filename(struct packed_git *p) |
| 180 | { |
| 181 | size_t len; |
| 182 | if (!strip_suffix(p->pack_name, ".pack", &len)) |
| 183 | BUG("pack_name does not end in .pack"); |
| 184 | return xstrfmt("%.*s.rev", (int)len, p->pack_name); |
| 185 | } |
| 186 | |
| 187 | #define RIDX_HEADER_SIZE (12) |
| 188 | #define RIDX_MIN_SIZE (RIDX_HEADER_SIZE + (2 * the_hash_algo->rawsz)) |
| 189 | |
| 190 | struct revindex_header { |
| 191 | uint32_t signature; |
| 192 | uint32_t version; |
| 193 | uint32_t hash_id; |
| 194 | }; |
| 195 | |
| 196 | static int load_revindex_from_disk(char *revindex_name, |
| 197 | uint32_t num_objects, |
| 198 | const uint32_t **data_p, size_t *len_p) |
| 199 | { |
| 200 | int fd, ret = 0; |
| 201 | struct stat st; |
| 202 | void *data = NULL; |
| 203 | size_t revindex_size; |
| 204 | struct revindex_header *hdr; |
| 205 | |
| 206 | fd = git_open(revindex_name); |
| 207 | |
| 208 | if (fd < 0) { |
| 209 | ret = -1; |
| 210 | goto cleanup; |
| 211 | } |
| 212 | if (fstat(fd, &st)) { |
| 213 | ret = error_errno(_("failed to read %s"), revindex_name); |
| 214 | goto cleanup; |
| 215 | } |
| 216 | |
| 217 | revindex_size = xsize_t(st.st_size); |
| 218 | |
| 219 | if (revindex_size < RIDX_MIN_SIZE) { |
| 220 | ret = error(_("reverse-index file %s is too small"), revindex_name); |
| 221 | goto cleanup; |
| 222 | } |
| 223 | |
| 224 | if (revindex_size - RIDX_MIN_SIZE != st_mult(sizeof(uint32_t), num_objects)) { |
| 225 | ret = error(_("reverse-index file %s is corrupt"), revindex_name); |
| 226 | goto cleanup; |
| 227 | } |
| 228 | |
| 229 | data = xmmap(NULL, revindex_size, PROT_READ, MAP_PRIVATE, fd, 0); |
| 230 | hdr = data; |
| 231 | |
| 232 | if (ntohl(hdr->signature) != RIDX_SIGNATURE) { |
| 233 | ret = error(_("reverse-index file %s has unknown signature"), revindex_name); |
| 234 | goto cleanup; |
| 235 | } |
| 236 | if (ntohl(hdr->version) != 1) { |
| 237 | ret = error(_("reverse-index file %s has unsupported version %"PRIu32), |
| 238 | revindex_name, ntohl(hdr->version)); |
| 239 | goto cleanup; |
| 240 | } |
| 241 | if (!(ntohl(hdr->hash_id) == 1 || ntohl(hdr->hash_id) == 2)) { |
| 242 | ret = error(_("reverse-index file %s has unsupported hash id %"PRIu32), |
| 243 | revindex_name, ntohl(hdr->hash_id)); |
| 244 | goto cleanup; |
| 245 | } |
| 246 | |
| 247 | cleanup: |
| 248 | if (ret) { |
| 249 | if (data) |
| 250 | munmap(data, revindex_size); |
| 251 | } else { |
| 252 | *len_p = revindex_size; |
| 253 | *data_p = (const uint32_t *)data; |
| 254 | } |
| 255 | |
Taylor Blau | 66f52fa | 2021-02-26 11:31:02 -0500 | [diff] [blame] | 256 | if (fd >= 0) |
| 257 | close(fd); |
Taylor Blau | 2f4ba2a | 2021-01-25 18:37:14 -0500 | [diff] [blame] | 258 | return ret; |
| 259 | } |
| 260 | |
| 261 | static int load_pack_revindex_from_disk(struct packed_git *p) |
| 262 | { |
| 263 | char *revindex_name; |
| 264 | int ret; |
| 265 | if (open_pack_index(p)) |
| 266 | return -1; |
| 267 | |
| 268 | revindex_name = pack_revindex_filename(p); |
| 269 | |
| 270 | ret = load_revindex_from_disk(revindex_name, |
| 271 | p->num_objects, |
| 272 | &p->revindex_map, |
| 273 | &p->revindex_size); |
| 274 | if (ret) |
| 275 | goto cleanup; |
| 276 | |
| 277 | p->revindex_data = (const uint32_t *)((const char *)p->revindex_map + RIDX_HEADER_SIZE); |
| 278 | |
| 279 | cleanup: |
| 280 | free(revindex_name); |
| 281 | return ret; |
| 282 | } |
| 283 | |
Jeff King | 4828ce9 | 2019-04-05 14:04:24 -0400 | [diff] [blame] | 284 | int load_pack_revindex(struct packed_git *p) |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 285 | { |
Taylor Blau | 2f4ba2a | 2021-01-25 18:37:14 -0500 | [diff] [blame] | 286 | if (p->revindex || p->revindex_data) |
| 287 | return 0; |
| 288 | |
| 289 | if (!load_pack_revindex_from_disk(p)) |
| 290 | return 0; |
| 291 | else if (!create_pack_revindex_in_memory(p)) |
| 292 | return 0; |
| 293 | return -1; |
Vicent Marti | 92e5c77 | 2013-10-24 14:00:36 -0400 | [diff] [blame] | 294 | } |
| 295 | |
Taylor Blau | 8389855 | 2021-01-13 17:25:02 -0500 | [diff] [blame] | 296 | int offset_to_pack_pos(struct packed_git *p, off_t ofs, uint32_t *pos) |
Vicent Marti | 92e5c77 | 2013-10-24 14:00:36 -0400 | [diff] [blame] | 297 | { |
Taylor Blau | 8389855 | 2021-01-13 17:25:02 -0500 | [diff] [blame] | 298 | unsigned lo, hi; |
Taylor Blau | 8389855 | 2021-01-13 17:25:02 -0500 | [diff] [blame] | 299 | |
| 300 | if (load_pack_revindex(p) < 0) |
| 301 | return -1; |
| 302 | |
| 303 | lo = 0; |
| 304 | hi = p->num_objects + 1; |
Vicent Marti | 92e5c77 | 2013-10-24 14:00:36 -0400 | [diff] [blame] | 305 | |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 306 | do { |
Shahzad Lone | 33de80b | 2019-02-02 23:16:45 +0000 | [diff] [blame] | 307 | const unsigned mi = lo + (hi - lo) / 2; |
Taylor Blau | e5dcd78 | 2021-01-13 17:25:10 -0500 | [diff] [blame] | 308 | off_t got = pack_pos_to_offset(p, mi); |
| 309 | |
| 310 | if (got == ofs) { |
Taylor Blau | 8389855 | 2021-01-13 17:25:02 -0500 | [diff] [blame] | 311 | *pos = mi; |
| 312 | return 0; |
Taylor Blau | e5dcd78 | 2021-01-13 17:25:10 -0500 | [diff] [blame] | 313 | } else if (ofs < got) |
Nicolas Pitre | 3449f8c | 2008-02-28 00:25:17 -0500 | [diff] [blame] | 314 | hi = mi; |
| 315 | else |
| 316 | lo = mi + 1; |
| 317 | } while (lo < hi); |
Vicent Marti | 92e5c77 | 2013-10-24 14:00:36 -0400 | [diff] [blame] | 318 | |
Nicolas Pitre | 08698b1 | 2008-10-29 19:02:49 -0400 | [diff] [blame] | 319 | error("bad offset for revindex"); |
Vicent Marti | 92e5c77 | 2013-10-24 14:00:36 -0400 | [diff] [blame] | 320 | return -1; |
| 321 | } |
| 322 | |
Taylor Blau | f33fb6e | 2021-01-13 17:23:31 -0500 | [diff] [blame] | 323 | uint32_t pack_pos_to_index(struct packed_git *p, uint32_t pos) |
| 324 | { |
Taylor Blau | 2f4ba2a | 2021-01-25 18:37:14 -0500 | [diff] [blame] | 325 | if (!(p->revindex || p->revindex_data)) |
Taylor Blau | f33fb6e | 2021-01-13 17:23:31 -0500 | [diff] [blame] | 326 | BUG("pack_pos_to_index: reverse index not yet loaded"); |
| 327 | if (p->num_objects <= pos) |
| 328 | BUG("pack_pos_to_index: out-of-bounds object at %"PRIu32, pos); |
Taylor Blau | 2f4ba2a | 2021-01-25 18:37:14 -0500 | [diff] [blame] | 329 | |
| 330 | if (p->revindex) |
| 331 | return p->revindex[pos].nr; |
| 332 | else |
| 333 | return get_be32(p->revindex_data + pos); |
Taylor Blau | f33fb6e | 2021-01-13 17:23:31 -0500 | [diff] [blame] | 334 | } |
| 335 | |
| 336 | off_t pack_pos_to_offset(struct packed_git *p, uint32_t pos) |
| 337 | { |
Taylor Blau | 2f4ba2a | 2021-01-25 18:37:14 -0500 | [diff] [blame] | 338 | if (!(p->revindex || p->revindex_data)) |
Taylor Blau | f33fb6e | 2021-01-13 17:23:31 -0500 | [diff] [blame] | 339 | BUG("pack_pos_to_index: reverse index not yet loaded"); |
| 340 | if (p->num_objects < pos) |
| 341 | BUG("pack_pos_to_offset: out-of-bounds object at %"PRIu32, pos); |
Taylor Blau | 2f4ba2a | 2021-01-25 18:37:14 -0500 | [diff] [blame] | 342 | |
| 343 | if (p->revindex) |
| 344 | return p->revindex[pos].offset; |
| 345 | else if (pos == p->num_objects) |
| 346 | return p->pack_size - the_hash_algo->rawsz; |
| 347 | else |
| 348 | return nth_packed_object_offset(p, pack_pos_to_index(p, pos)); |
Taylor Blau | f33fb6e | 2021-01-13 17:23:31 -0500 | [diff] [blame] | 349 | } |