Blame - sha1-lookup.c - jrn/git

blob: c4dc55d1f5cd07adcf46865354f841cc587c51f6 [file] [log] [blame]

Junio C Hamano	628522e	2007-12-29 02:05:47 -0800	[diff] [blame]	1	#include "cache.h"
				2	#include "sha1-lookup.h"
				3
/*
 * Read the two bytes at sha1[0] and sha1[1] as one 16-bit big-endian
 * value (sha1[0] is the high byte) and return it widened to uint32_t.
 */
static uint32_t take2(const unsigned char *sha1)
{
	const uint32_t high = sha1[0];
	const uint32_t low = sha1[1];

	return (high << 8) | low;
}
				8
				9	/*
				10	* Conventional binary search loop looks like this:
				11	*
				12	* do {
				13	* int mi = (lo + hi) / 2;
				14	* int cmp = "entry pointed at by mi" minus "target";
				15	* if (!cmp)
				16	* return (mi is the wanted one)
				17	* if (cmp > 0)
				18	* hi = mi; "mi is larger than target"
				19	* else
				20	* lo = mi+1; "mi is smaller than target"
				21	* } while (lo < hi);
				22	*
				23	* The invariants are:
				24	*
				25	* - When entering the loop, lo points at a slot that is never
				26	* above the target (it could be at the target), hi points at a
				27	* slot that is guaranteed to be above the target (it can never
				28	* be at the target).
				29	*
				30	* - We find a point 'mi' between lo and hi (mi could be the same
				31	* as lo, but never can be the same as hi), and check if it hits
				32	* the target. There are three cases:
				33	*
				34	* - if it is a hit, we are happy.
				35	*
				36	* - if it is strictly higher than the target, we update hi with
				37	* it.
				38	*
				39	* - if it is strictly lower than the target, we update lo to be
				40	* one slot after it, because we allow lo to be at the target.
				41	*
				42	* When choosing 'mi', we do not have to take the "middle" but
				43	* anywhere in between lo and hi, as long as lo <= mi < hi is
				44	* satisfied. When we somehow know that the distance between the
				45	* target and lo is much shorter than the target and hi, we could
				46	* pick mi that is much closer to lo than the midway.
				47	*/
				48	/*
				49	* The table should contain "nr" elements.
				50	* The sha1 of element i (between 0 and nr - 1) should be returned
				51	* by "fn(i, table)".
				52	*/
				53	int sha1_pos(const unsigned char sha1, void table, size_t nr,
				54	sha1_access_fn fn)
				55	{
				56	size_t hi = nr;
				57	size_t lo = 0;
				58	size_t mi = 0;
				59
				60	if (!nr)
				61	return -1;
				62
				63	if (nr != 1) {
				64	size_t lov, hiv, miv, ofs;
				65
				66	for (ofs = 0; ofs < 18; ofs += 2) {
				67	lov = take2(fn(0, table) + ofs);
				68	hiv = take2(fn(nr - 1, table) + ofs);
				69	miv = take2(sha1 + ofs);
				70	if (miv < lov)
				71	return -1;
				72	if (hiv < miv)
				73	return -1 - nr;
				74	if (lov != hiv) {
				75	/*
				76	* At this point miv could be equal
				77	* to hiv (but sha1 could still be higher);
				78	* the invariant of (mi < hi) should be
				79	* kept.
				80	*/
				81	mi = (nr - 1) * (miv - lov) / (hiv - lov);
				82	if (lo <= mi && mi < hi)
				83	break;
Junio C Hamano	1a7b1f6	2009-04-06 00:48:49 -0700	[diff] [blame]	84	die("BUG: assertion failed in binary search");
Christian Couder	96beef8	2009-04-04 22:59:26 +0200	[diff] [blame]	85	}
				86	}
				87	if (18 <= ofs)
				88	die("cannot happen -- lo and hi are identical");
				89	}
				90
				91	do {
				92	int cmp;
				93	cmp = hashcmp(fn(mi, table), sha1);
				94	if (!cmp)
				95	return mi;
				96	if (cmp > 0)
				97	hi = mi;
				98	else
				99	lo = mi + 1;
				100	mi = (hi + lo) / 2;
				101	} while (lo < hi);
				102	return -lo-1;
				103	}
				104
Junio C Hamano	628522e	2007-12-29 02:05:47 -0800	[diff] [blame]	105	/*
				106	* Conventional binary search loop looks like this:
				107	*
				108	* unsigned lo, hi;
				109	* do {
				110	* unsigned mi = (lo + hi) / 2;
				111	* int cmp = "entry pointed at by mi" minus "target";
				112	* if (!cmp)
				113	* return (mi is the wanted one)
				114	* if (cmp > 0)
				115	* hi = mi; "mi is larger than target"
				116	* else
				117	* lo = mi+1; "mi is smaller than target"
				118	* } while (lo < hi);
				119	*
				120	* The invariants are:
				121	*
				122	* - When entering the loop, lo points at a slot that is never
				123	* above the target (it could be at the target), hi points at a
				124	* slot that is guaranteed to be above the target (it can never
				125	* be at the target).
				126	*
				127	* - We find a point 'mi' between lo and hi (mi could be the same
				128	* as lo, but can never be the same as hi), and check if it hits
				129	* the target. There are three cases:
				130	*
				131	* - if it is a hit, we are happy.
				132	*
				133	* - if it is strictly higher than the target, we set it to hi,
				134	* and repeat the search.
				135	*
				136	* - if it is strictly lower than the target, we update lo to
				137	* one slot after it, because we allow lo to be at the target.
				138	*
				139	* If the loop exits, there is no matching entry.
				140	*
				141	* When choosing 'mi', we do not have to take the "middle" but
				142	* anywhere in between lo and hi, as long as lo <= mi < hi is
				143	* satisfied. When we somehow know that the distance between the
				144	* target and lo is much shorter than the target and hi, we could
				145	* pick mi that is much closer to lo than the midway.
				146	*
				147	* Now, we can take advantage of the fact that SHA-1 is a good hash
				148	* function, and as long as there are enough entries in the table, we
				149	* can expect uniform distribution. An entry that begins with, for
				150	* example, "deadbeef..." is much more likely to appear well past
				151	* the midpoint of the table. It can reasonably be expected to be near
				152	* 87% (222/256) from the top of the table.
				153	*
Junio C Hamano	12ecb01	2007-12-30 03:13:27 -0800	[diff] [blame]	154	* However, we do not want to pick "mi" too precisely. If the entry at
				155	* the 87% in the above example turns out to be higher than the target
				156	* we are looking for, we would end up narrowing the search space down
				157	* only by 13%, instead of 50% we would get if we did a simple binary
				158	* search. So we would want to hedge our bets by being less aggressive.
				159	*
Junio C Hamano	628522e	2007-12-29 02:05:47 -0800	[diff] [blame]	160	* The table at "table" holds at least "nr" entries of "elem_size"
				161	* bytes each. Each entry has the SHA-1 key at "key_offset". The
				162	* table is sorted by the SHA-1 key of the entries. The caller wants
				163	* to find the entry with "key", and knows that the entry at "lo" is
				164	* not higher than the entry it is looking for, and that the entry at
				165	* "hi" is higher than the entry it is looking for.
				166	*/
				167	int sha1_entry_pos(const void *table,
				168	size_t elem_size,
				169	size_t key_offset,
				170	unsigned lo, unsigned hi, unsigned nr,
				171	const unsigned char *key)
				172	{
				173	const unsigned char *base = table;
				174	const unsigned char hi_key, lo_key;
				175	unsigned ofs_0;
				176	static int debug_lookup = -1;
				177
				178	if (debug_lookup < 0)
				179	debug_lookup = !!getenv("GIT_DEBUG_LOOKUP");
				180
				181	if (!nr \|\| lo >= hi)
				182	return -1;
				183
				184	if (nr == hi)
				185	hi_key = NULL;
				186	else
				187	hi_key = base + elem_size * hi + key_offset;
				188	lo_key = base + elem_size * lo + key_offset;
				189
				190	ofs_0 = 0;
				191	do {
				192	int cmp;
				193	unsigned ofs, mi, range;
				194	unsigned lov, hiv, kyv;
				195	const unsigned char *mi_key;
				196
				197	range = hi - lo;
				198	if (hi_key) {
				199	for (ofs = ofs_0; ofs < 20; ofs++)
				200	if (lo_key[ofs] != hi_key[ofs])
				201	break;
				202	ofs_0 = ofs;
				203	/*
				204	* byte 0 thru (ofs-1) are the same between
				205	* lo and hi; ofs is the first byte that is
				206	* different.
				207	*/
				208	hiv = hi_key[ofs_0];
				209	if (ofs_0 < 19)
				210	hiv = (hiv << 8) \| hi_key[ofs_0+1];
				211	} else {
				212	hiv = 256;
				213	if (ofs_0 < 19)
				214	hiv <<= 8;
				215	}
				216	lov = lo_key[ofs_0];
				217	kyv = key[ofs_0];
				218	if (ofs_0 < 19) {
				219	lov = (lov << 8) \| lo_key[ofs_0+1];
				220	kyv = (kyv << 8) \| key[ofs_0+1];
				221	}
				222	assert(lov < hiv);
				223
				224	if (kyv < lov)
				225	return -1 - lo;
				226	if (hiv < kyv)
				227	return -1 - hi;
				228
Junio C Hamano	12ecb01	2007-12-30 03:13:27 -0800	[diff] [blame]	229	/*
				230	* Even if we know the target is much closer to 'hi'
				231	* than 'lo', if we pick too precisely and overshoot
				232	* (e.g. when we know 'mi' is closer to 'hi' than to
				233	* 'lo', pick 'mi' that is higher than the target), we
				234	* end up narrowing the search space by a smaller
				235	* amount (i.e. the distance between 'mi' and 'hi')
				236	* than what we would have (i.e. about half of 'lo'
				237	* and 'hi'). Hedge our bets to pick 'mi' less
				238	* aggressively, i.e. make 'mi' a bit closer to the
				239	* middle than we would otherwise pick.
				240	*/
				241	kyv = (kyv * 6 + lov + hiv) / 8;
				242	if (lov < hiv - 1) {
				243	if (kyv == lov)
				244	kyv++;
				245	else if (kyv == hiv)
				246	kyv--;
				247	}
Junio C Hamano	628522e	2007-12-29 02:05:47 -0800	[diff] [blame]	248	mi = (range - 1) * (kyv - lov) / (hiv - lov) + lo;
				249
				250	if (debug_lookup) {
				251	printf("lo %u hi %u rg %u mi %u ", lo, hi, range, mi);
				252	printf("ofs %u lov %x, hiv %x, kyv %x\n",
				253	ofs_0, lov, hiv, kyv);
				254	}
				255	if (!(lo <= mi && mi < hi))
				256	die("assertion failure lo %u mi %u hi %u %s",
				257	lo, mi, hi, sha1_to_hex(key));
				258
				259	mi_key = base + elem_size * mi + key_offset;
				260	cmp = memcmp(mi_key + ofs_0, key + ofs_0, 20 - ofs_0);
				261	if (!cmp)
				262	return mi;
				263	if (cmp > 0) {
				264	hi = mi;
				265	hi_key = mi_key;
Junio C Hamano	12ecb01	2007-12-30 03:13:27 -0800	[diff] [blame]	266	} else {
Junio C Hamano	628522e	2007-12-29 02:05:47 -0800	[diff] [blame]	267	lo = mi + 1;
				268	lo_key = mi_key + elem_size;
				269	}
				270	} while (lo < hi);
				271	return -lo-1;
				272	}