Blame - blame.c - jrn/git

blob: 82fa16d6585b90e3635d87f6adc1e4856524f7d6 [file] [log] [blame]

Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	1	#include "cache.h"
				2	#include "refs.h"
Stefan Beller	cbd53a2	2018-05-15 16:42:15 -0700	[diff] [blame]	3	#include "object-store.h"
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	4	#include "cache-tree.h"
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	5	#include "mergesort.h"
				6	#include "diff.h"
				7	#include "diffcore.h"
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	8	#include "tag.h"
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	9	#include "blame.h"
Stefan Beller	14ba97f	2018-05-15 14:48:42 -0700	[diff] [blame]	10	#include "alloc.h"
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	11	#include "commit-slab.h"
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	12	#include "bloom.h"
				13	#include "commit-graph.h"
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	14
				15	define_commit_slab(blame_suspects, struct blame_origin *);
				16	static struct blame_suspects blame_suspects;
				17
				18	struct blame_origin get_blame_suspects(struct commit commit)
				19	{
				20	struct blame_origin **result;
				21
				22	result = blame_suspects_peek(&blame_suspects, commit);
				23
				24	return result ? *result : NULL;
				25	}
				26
				27	static void set_blame_suspects(struct commit commit, struct blame_origin origin)
				28	{
				29	*blame_suspects_at(&blame_suspects, commit) = origin;
				30	}
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	31
				32	void blame_origin_decref(struct blame_origin *o)
				33	{
				34	if (o && --o->refcnt <= 0) {
				35	struct blame_origin p, l = NULL;
				36	if (o->previous)
				37	blame_origin_decref(o->previous);
				38	free(o->file.ptr);
				39	/* Should be present exactly once in commit chain */
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	40	for (p = get_blame_suspects(o->commit); p; l = p, p = p->next) {
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	41	if (p == o) {
				42	if (l)
				43	l->next = p->next;
				44	else
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	45	set_blame_suspects(o->commit, p->next);
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	46	free(o);
				47	return;
				48	}
				49	}
				50	die("internal error in blame_origin_decref");
				51	}
				52	}
				53
				54	/*
				55	* Given a commit and a path in it, create a new origin structure.
				56	* The callers that add blame to the scoreboard should use
				57	* get_origin() to obtain shared, refcounted copy instead of calling
				58	* this function directly.
				59	*/
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	60	static struct blame_origin make_origin(struct commit commit, const char *path)
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	61	{
				62	struct blame_origin *o;
				63	FLEX_ALLOC_STR(o, path, path);
				64	o->commit = commit;
				65	o->refcnt = 1;
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	66	o->next = get_blame_suspects(commit);
				67	set_blame_suspects(commit, o);
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	68	return o;
				69	}
				70
				71	/*
				72	* Locate an existing origin or create a new one.
				73	* This moves the origin to front position in the commit util list.
				74	*/
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	75	static struct blame_origin get_origin(struct commit commit, const char *path)
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	76	{
				77	struct blame_origin o, l;
				78
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	79	for (o = get_blame_suspects(commit), l = NULL; o; l = o, o = o->next) {
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	80	if (!strcmp(o->path, path)) {
				81	/* bump to front */
				82	if (l) {
				83	l->next = o->next;
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	84	o->next = get_blame_suspects(commit);
				85	set_blame_suspects(commit, o);
Jeff Smith	f5dd754	2017-05-24 00:15:33 -0500	[diff] [blame]	86	}
				87	return blame_origin_incref(o);
				88	}
				89	}
				90	return make_origin(commit, path);
				91	}
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	92
				93
				94
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	95	static void verify_working_tree_path(struct repository *r,
Nguyễn Thái Ngọc Duy	ecbbc0a	2018-08-13 18:14:41 +0200	[diff] [blame]	96	struct commit work_tree, const char path)
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	97	{
				98	struct commit_list *parents;
				99	int pos;
				100
				101	for (parents = work_tree->parents; parents; parents = parents->next) {
				102	const struct object_id *commit_oid = &parents->item->object.oid;
				103	struct object_id blob_oid;
Elijah Newren	5ec1e72	2019-04-05 08:00:12 -0700	[diff] [blame]	104	unsigned short mode;
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	105
Nguyễn Thái Ngọc Duy	50ddb08	2019-06-27 16:28:49 +0700	[diff] [blame]	106	if (!get_tree_entry(r, commit_oid, path, &blob_oid, &mode) &&
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	107	oid_object_info(r, &blob_oid, NULL) == OBJ_BLOB)
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	108	return;
				109	}
				110
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	111	pos = index_name_pos(r->index, path, strlen(path));
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	112	if (pos >= 0)
				113	; /* path is in the index */
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	114	else if (-1 - pos < r->index->cache_nr &&
				115	!strcmp(r->index->cache[-1 - pos]->name, path))
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	116	; /* path is in the index, unmerged */
				117	else
				118	die("no such path '%s' in HEAD", path);
				119	}
				120
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	121	static struct commit_list *append_parent(struct repository r,
				122	struct commit_list **tail,
				123	const struct object_id *oid)
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	124	{
				125	struct commit *parent;
				126
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	127	parent = lookup_commit_reference(r, oid);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	128	if (!parent)
				129	die("no such commit %s", oid_to_hex(oid));
				130	return &commit_list_insert(parent, tail)->next;
				131	}
				132
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	133	static void append_merge_parents(struct repository *r,
				134	struct commit_list **tail)
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	135	{
				136	int merge_head;
				137	struct strbuf line = STRBUF_INIT;
				138
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	139	merge_head = open(git_path_merge_head(r), O_RDONLY);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	140	if (merge_head < 0) {
				141	if (errno == ENOENT)
				142	return;
Stefan Beller	102de88	2018-05-17 15:51:51 -0700	[diff] [blame]	143	die("cannot open '%s' for reading",
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	144	git_path_merge_head(r));
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	145	}
				146
				147	while (!strbuf_getwholeline_fd(&line, merge_head, '\n')) {
				148	struct object_id oid;
brian m. carlson	fee4930	2019-08-18 20:04:08 +0000	[diff] [blame]	149	if (get_oid_hex(line.buf, &oid))
Stefan Beller	102de88	2018-05-17 15:51:51 -0700	[diff] [blame]	150	die("unknown line in '%s': %s",
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	151	git_path_merge_head(r), line.buf);
				152	tail = append_parent(r, tail, &oid);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	153	}
				154	close(merge_head);
				155	strbuf_release(&line);
				156	}
				157
				158	/*
				159	* This isn't as simple as passing sb->buf and sb->len, because we
				160	* want to transfer ownership of the buffer to the commit (so we
				161	* must use detach).
				162	*/
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	163	static void set_commit_buffer_from_strbuf(struct repository *r,
				164	struct commit *c,
				165	struct strbuf *sb)
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	166	{
				167	size_t len;
				168	void *buf = strbuf_detach(sb, &len);
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	169	set_commit_buffer(r, c, buf, len);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	170	}
				171
				172	/*
				173	* Prepare a dummy commit that represents the work tree (or staged) item.
				174	* Note that annotating work tree item never works in the reverse.
				175	*/
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	176	static struct commit fake_working_tree_commit(struct repository r,
Nguyễn Thái Ngọc Duy	ecbbc0a	2018-08-13 18:14:41 +0200	[diff] [blame]	177	struct diff_options *opt,
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	178	const char *path,
				179	const char *contents_from)
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	180	{
				181	struct commit *commit;
				182	struct blame_origin *origin;
				183	struct commit_list *parent_tail, parent;
				184	struct object_id head_oid;
				185	struct strbuf buf = STRBUF_INIT;
				186	const char *ident;
				187	time_t now;
Jameson Miller	a849735	2018-07-02 19:49:31 +0000	[diff] [blame]	188	int len;
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	189	struct cache_entry *ce;
				190	unsigned mode;
				191	struct strbuf msg = STRBUF_INIT;
				192
Nguyễn Thái Ngọc Duy	e1ff0a3	2019-01-12 09:13:26 +0700	[diff] [blame]	193	repo_read_index(r);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	194	time(&now);
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	195	commit = alloc_commit_node(r);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	196	commit->object.parsed = 1;
				197	commit->date = now;
				198	parent_tail = &commit->parents;
				199
brian m. carlson	49e6147	2017-10-15 22:07:09 +0000	[diff] [blame]	200	if (!resolve_ref_unsafe("HEAD", RESOLVE_REF_READING, &head_oid, NULL))
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	201	die("no such ref: HEAD");
				202
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	203	parent_tail = append_parent(r, parent_tail, &head_oid);
				204	append_merge_parents(r, parent_tail);
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	205	verify_working_tree_path(r, commit, path);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	206
				207	origin = make_origin(commit, path);
				208
William Hubbs	39ab4d0	2019-02-04 12:48:50 -0600	[diff] [blame]	209	ident = fmt_ident("Not Committed Yet", "not.committed.yet",
				210	WANT_BLANK_IDENT, NULL, 0);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	211	strbuf_addstr(&msg, "tree 0000000000000000000000000000000000000000\n");
				212	for (parent = commit->parents; parent; parent = parent->next)
				213	strbuf_addf(&msg, "parent %s\n",
				214	oid_to_hex(&parent->item->object.oid));
				215	strbuf_addf(&msg,
				216	"author %s\n"
				217	"committer %s\n\n"
				218	"Version of %s from %s\n",
				219	ident, ident, path,
				220	(!contents_from ? path :
				221	(!strcmp(contents_from, "-") ? "standard input" : contents_from)));
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	222	set_commit_buffer_from_strbuf(r, commit, &msg);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	223
				224	if (!contents_from \|\| strcmp("-", contents_from)) {
				225	struct stat st;
				226	const char *read_from;
				227	char *buf_ptr;
				228	unsigned long buf_len;
				229
				230	if (contents_from) {
				231	if (stat(contents_from, &st) < 0)
				232	die_errno("Cannot stat '%s'", contents_from);
				233	read_from = contents_from;
				234	}
				235	else {
				236	if (lstat(path, &st) < 0)
				237	die_errno("Cannot lstat '%s'", path);
				238	read_from = path;
				239	}
				240	mode = canon_mode(st.st_mode);
				241
				242	switch (st.st_mode & S_IFMT) {
				243	case S_IFREG:
Brandon Williams	0d1e0e7	2017-10-31 11:19:11 -0700	[diff] [blame]	244	if (opt->flags.allow_textconv &&
Nguyễn Thái Ngọc Duy	6afaf80	2018-09-21 17:57:22 +0200	[diff] [blame]	245	textconv_object(r, read_from, mode, &null_oid, 0, &buf_ptr, &buf_len))
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	246	strbuf_attach(&buf, buf_ptr, buf_len, buf_len + 1);
				247	else if (strbuf_read_file(&buf, read_from, st.st_size) != st.st_size)
				248	die_errno("cannot open or read '%s'", read_from);
				249	break;
				250	case S_IFLNK:
				251	if (strbuf_readlink(&buf, read_from, st.st_size) < 0)
				252	die_errno("cannot readlink '%s'", read_from);
				253	break;
				254	default:
				255	die("unsupported file type %s", read_from);
				256	}
				257	}
				258	else {
				259	/* Reading from stdin */
				260	mode = 0;
				261	if (strbuf_read(&buf, 0, 0) < 0)
				262	die_errno("failed to read from stdin");
				263	}
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	264	convert_to_git(r->index, path, buf.buf, buf.len, &buf, 0);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	265	origin->file.ptr = buf.buf;
				266	origin->file.size = buf.len;
Patryk Obara	829e5c3	2018-01-28 01:13:11 +0100	[diff] [blame]	267	pretend_object_file(buf.buf, buf.len, OBJ_BLOB, &origin->blob_oid);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	268
				269	/*
				270	* Read the current index, replace the path entry with
				271	* origin->blob_sha1 without mucking with its mode or type
				272	* bits; we are not going to write this index out -- we just
				273	* want to run "diff-index --cached".
				274	*/
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	275	discard_index(r->index);
Nguyễn Thái Ngọc Duy	e1ff0a3	2019-01-12 09:13:26 +0700	[diff] [blame]	276	repo_read_index(r);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	277
				278	len = strlen(path);
				279	if (!mode) {
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	280	int pos = index_name_pos(r->index, path, len);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	281	if (0 <= pos)
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	282	mode = r->index->cache[pos]->ce_mode;
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	283	else
				284	/* Let's not bother reading from HEAD tree */
				285	mode = S_IFREG \| 0644;
				286	}
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	287	ce = make_empty_cache_entry(r->index, len);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	288	oidcpy(&ce->oid, &origin->blob_oid);
				289	memcpy(ce->name, path, len);
				290	ce->ce_flags = create_ce_flags(0);
				291	ce->ce_namelen = len;
				292	ce->ce_mode = create_ce_mode(mode);
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	293	add_index_entry(r->index, ce,
Nguyễn Thái Ngọc Duy	ecbbc0a	2018-08-13 18:14:41 +0200	[diff] [blame]	294	ADD_CACHE_OK_TO_ADD \| ADD_CACHE_OK_TO_REPLACE);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	295
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	296	cache_tree_invalidate_path(r->index, path);
Jeff Smith	072bf43	2017-05-24 00:15:34 -0500	[diff] [blame]	297
				298	return commit;
				299	}
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	300
				301
				302
				303	static int diff_hunks(mmfile_t file_a, mmfile_t file_b,
				304	xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts)
				305	{
				306	xpparam_t xpp = {0};
				307	xdemitconf_t xecfg = {0};
				308	xdemitcb_t ecb = {NULL};
				309
				310	xpp.flags = xdl_opts;
				311	xecfg.hunk_func = hunk_func;
				312	ecb.priv = cb_data;
				313	return xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb);
				314	}
				315
/*
 * Return a pointer to the start of the line following the one that
 * begins at "start", i.e. just past the next '\n'.  When no newline
 * occurs in [start, end), "end" itself is returned.
 */
static const char *get_next_line(const char *start, const char *end)
{
	const char *nl = memchr(start, '\n', end - start);

	return nl ? nl + 1 : end;
}
				322
				323	static int find_line_starts(int *line_starts, const char buf,
				324	unsigned long len)
				325	{
				326	const char *end = buf + len;
				327	const char *p;
				328	int *lineno;
				329	int num = 0;
				330
				331	for (p = buf; p < end; p = get_next_line(p, end))
				332	num++;
				333
				334	ALLOC_ARRAY(*line_starts, num + 1);
				335	lineno = *line_starts;
				336
				337	for (p = buf; p < end; p = get_next_line(p, end))
				338	*lineno++ = p - buf;
				339
				340	*lineno = len;
				341
				342	return num;
				343	}
				344
Michael Platings	1d028dc	2019-06-20 12:38:18 -0400	[diff] [blame]	345	struct fingerprint_entry;
				346
				347	/* A fingerprint is intended to loosely represent a string, such that two
				348	* fingerprints can be quickly compared to give an indication of the similarity
				349	* of the strings that they represent.
				350	*
				351	* A fingerprint is represented as a multiset of the lower-cased byte pairs in
				352	* the string that it represents. Whitespace is added at each end of the
				353	* string. Whitespace pairs are ignored. Whitespace is converted to '\0'.
				354	* For example, the string "Darth Radar" will be converted to the following
				355	* fingerprint:
				356	* {"\0d", "da", "da", "ar", "ar", "rt", "th", "h\0", "\0r", "ra", "ad", "r\0"}
				357	*
				358	* The similarity between two fingerprints is the size of the intersection of
				359	* their multisets, including repeated elements. See fingerprint_similarity for
				360	* examples.
				361	*
				362	* For ease of implementation, the fingerprint is implemented as a map
				363	* of byte pairs to the count of that byte pair in the string, instead of
				364	* allowing repeated elements in a set.
				365	*/
				366	struct fingerprint {
				367	struct hashmap map;
				368	/* As we know the maximum number of entries in advance, it's
				369	* convenient to store the entries in a single array instead of having
				370	* the hashmap manage the memory.
				371	*/
				372	struct fingerprint_entry *entries;
				373	};
				374
				375	/* A byte pair in a fingerprint. Stores the number of times the byte pair
				376	* occurs in the string that the fingerprint represents.
				377	*/
				378	struct fingerprint_entry {
				379	/* The hashmap entry - the hash represents the byte pair in its
				380	* entirety so we don't need to store the byte pair separately.
				381	*/
				382	struct hashmap_entry entry;
				383	/* The number of times the byte pair occurs in the string that the
				384	* fingerprint represents.
				385	*/
				386	int count;
				387	};
				388
				389	/* See `struct fingerprint` for an explanation of what a fingerprint is.
				390	* \param result the fingerprint of the string is stored here. This must be
				391	* freed later using free_fingerprint.
				392	* \param line_begin the start of the string
				393	* \param line_end the end of the string
				394	*/
				395	static void get_fingerprint(struct fingerprint *result,
				396	const char *line_begin,
				397	const char *line_end)
				398	{
				399	unsigned int hash, c0 = 0, c1;
				400	const char *p;
				401	int max_map_entry_count = 1 + line_end - line_begin;
				402	struct fingerprint_entry *entry = xcalloc(max_map_entry_count,
				403	sizeof(struct fingerprint_entry));
				404	struct fingerprint_entry *found_entry;
				405
				406	hashmap_init(&result->map, NULL, NULL, max_map_entry_count);
				407	result->entries = entry;
				408	for (p = line_begin; p <= line_end; ++p, c0 = c1) {
				409	/* Always terminate the string with whitespace.
				410	* Normalise whitespace to 0, and normalise letters to
				411	* lower case. This won't work for multibyte characters but at
				412	* worst will match some unrelated characters.
				413	*/
				414	if ((p == line_end) \|\| isspace(*p))
				415	c1 = 0;
				416	else
				417	c1 = tolower(*p);
				418	hash = c0 \| (c1 << 8);
				419	/* Ignore whitespace pairs */
				420	if (hash == 0)
				421	continue;
Eric Wong	d22245a	2019-10-06 23:30:27 +0000	[diff] [blame]	422	hashmap_entry_init(&entry->entry, hash);
Michael Platings	1d028dc	2019-06-20 12:38:18 -0400	[diff] [blame]	423
Eric Wong	404ab78	2019-10-06 23:30:42 +0000	[diff] [blame]	424	found_entry = hashmap_get_entry(&result->map, entry,
				425	/* member name */ entry, NULL);
Michael Platings	1d028dc	2019-06-20 12:38:18 -0400	[diff] [blame]	426	if (found_entry) {
				427	found_entry->count += 1;
				428	} else {
				429	entry->count = 1;
Eric Wong	b94e5c1	2019-10-06 23:30:29 +0000	[diff] [blame]	430	hashmap_add(&result->map, &entry->entry);
Michael Platings	1d028dc	2019-06-20 12:38:18 -0400	[diff] [blame]	431	++entry;
				432	}
				433	}
				434	}
				435
				436	static void free_fingerprint(struct fingerprint *f)
				437	{
Eric Wong	c8e424c	2019-10-06 23:30:40 +0000	[diff] [blame]	438	hashmap_free(&f->map);
Michael Platings	1d028dc	2019-06-20 12:38:18 -0400	[diff] [blame]	439	free(f->entries);
				440	}
				441
				442	/* Calculates the similarity between two fingerprints as the size of the
				443	* intersection of their multisets, including repeated elements. See
				444	* `struct fingerprint` for an explanation of the fingerprint representation.
				445	* The similarity between "cat mat" and "father rather" is 2 because "at" is
				446	* present twice in both strings while the similarity between "tim" and "mit"
				447	* is 0.
				448	*/
				449	static int fingerprint_similarity(struct fingerprint a, struct fingerprint b)
				450	{
				451	int intersection = 0;
				452	struct hashmap_iter iter;
				453	const struct fingerprint_entry entry_a, entry_b;
				454
Eric Wong	87571c3	2019-10-06 23:30:38 +0000	[diff] [blame]	455	hashmap_for_each_entry(&b->map, &iter, entry_b,
Eric Wong	87571c3	2019-10-06 23:30:38 +0000	[diff] [blame]	456	entry /* member name */) {
Eric Wong	404ab78	2019-10-06 23:30:42 +0000	[diff] [blame]	457	entry_a = hashmap_get_entry(&a->map, entry_b, entry, NULL);
Eric Wong	f23a465	2019-10-06 23:30:36 +0000	[diff] [blame]	458	if (entry_a) {
Michael Platings	1d028dc	2019-06-20 12:38:18 -0400	[diff] [blame]	459	intersection += entry_a->count < entry_b->count ?
				460	entry_a->count : entry_b->count;
				461	}
				462	}
				463	return intersection;
				464	}
				465
				466	/* Subtracts byte-pair elements in B from A, modifying A in place.
				467	*/
				468	static void fingerprint_subtract(struct fingerprint a, struct fingerprint b)
				469	{
				470	struct hashmap_iter iter;
				471	struct fingerprint_entry *entry_a;
				472	const struct fingerprint_entry *entry_b;
				473
				474	hashmap_iter_init(&b->map, &iter);
				475
Eric Wong	87571c3	2019-10-06 23:30:38 +0000	[diff] [blame]	476	hashmap_for_each_entry(&b->map, &iter, entry_b,
Eric Wong	87571c3	2019-10-06 23:30:38 +0000	[diff] [blame]	477	entry /* member name */) {
Eric Wong	404ab78	2019-10-06 23:30:42 +0000	[diff] [blame]	478	entry_a = hashmap_get_entry(&a->map, entry_b, entry, NULL);
Eric Wong	f23a465	2019-10-06 23:30:36 +0000	[diff] [blame]	479	if (entry_a) {
Michael Platings	1d028dc	2019-06-20 12:38:18 -0400	[diff] [blame]	480	if (entry_a->count <= entry_b->count)
Eric Wong	28ee794	2019-10-06 23:30:31 +0000	[diff] [blame]	481	hashmap_remove(&a->map, &entry_b->entry, NULL);
Michael Platings	1d028dc	2019-06-20 12:38:18 -0400	[diff] [blame]	482	else
				483	entry_a->count -= entry_b->count;
				484	}
				485	}
				486	}
				487
				488	/* Calculate fingerprints for a series of lines.
				489	* Puts the fingerprints in the fingerprints array, which must have been
				490	* preallocated to allow storing line_count elements.
				491	*/
				492	static void get_line_fingerprints(struct fingerprint *fingerprints,
				493	const char content, const int line_starts,
				494	long first_line, long line_count)
				495	{
				496	int i;
				497	const char linestart, lineend;
				498
				499	line_starts += first_line;
				500	for (i = 0; i < line_count; ++i) {
				501	linestart = content + line_starts[i];
				502	lineend = content + line_starts[i + 1];
				503	get_fingerprint(fingerprints + i, linestart, lineend);
				504	}
				505	}
				506
				507	static void free_line_fingerprints(struct fingerprint *fingerprints,
				508	int nr_fingerprints)
				509	{
				510	int i;
				511
				512	for (i = 0; i < nr_fingerprints; i++)
				513	free_fingerprint(&fingerprints[i]);
				514	}
				515
				516	/* This contains the data necessary to linearly map a line number in one half
				517	* of a diff chunk to the line in the other half of the diff chunk that is
				518	* closest in terms of its position as a fraction of the length of the chunk.
				519	*/
				520	struct line_number_mapping {
				521	int destination_start, destination_length,
				522	source_start, source_length;
				523	};
				524
				525	/* Given a line number in one range, offset and scale it to map it onto the
				526	* other range.
				527	* Essentially this mapping is a simple linear equation but the calculation is
				528	* more complicated to allow performing it with integer operations.
				529	* Another complication is that if a line could map onto many lines in the
				530	* destination range then we want to choose the line at the center of those
				531	* possibilities.
				532	* Example: if the chunk is 2 lines long in A and 10 lines long in B then the
				533	* first 5 lines in B will map onto the first line in the A chunk, while the
				534	* last 5 lines will all map onto the second line in the A chunk.
				535	* Example: if the chunk is 10 lines long in A and 2 lines long in B then line
				536	* 0 in B will map onto line 2 in A, and line 1 in B will map onto line 7 in A.
				537	*/
				538	static int map_line_number(int line_number,
				539	const struct line_number_mapping *mapping)
				540	{
				541	return ((line_number - mapping->source_start) * 2 + 1) *
				542	mapping->destination_length /
				543	(mapping->source_length * 2) +
				544	mapping->destination_start;
				545	}
				546
				547	/* Get a pointer to the element storing the similarity between a line in A
				548	* and a line in B.
				549	*
				550	* The similarities are stored in a 2-dimensional array. Each "row" in the
				551	* array contains the similarities for a line in B. The similarities stored in
				552	* a row are the similarities between the line in B and the nearby lines in A.
				553	* To keep the length of each row the same, it is padded out with values of -1
				554	* where the search range extends beyond the lines in A.
				555	* For example, if max_search_distance_a is 2 and the two sides of a diff chunk
				556	* look like this:
				557	* a \| m
				558	* b \| n
				559	* c \| o
				560	* d \| p
				561	* e \| q
				562	* Then the similarity array will contain:
				563	* [-1, -1, am, bm, cm,
				564	* -1, an, bn, cn, dn,
				565	* ao, bo, co, do, eo,
				566	* bp, cp, dp, ep, -1,
				567	* cq, dq, eq, -1, -1]
				568	* Where similarities are denoted either by -1 for invalid, or the
				569	* concatenation of the two lines in the diff being compared.
				570	*
				571	* \param similarities array of similarities between lines in A and B
				572	* \param line_a the index of the line in A, in the same frame of reference as
				573	* closest_line_a.
				574	* \param local_line_b the index of the line in B, relative to the first line
				575	* in B that similarities represents.
				576	* \param closest_line_a the index of the line in A that is deemed to be
				577	* closest to local_line_b. This must be in the same
				578	* frame of reference as line_a. This value defines
				579	* where similarities is centered for the line in B.
				580	* \param max_search_distance_a maximum distance in lines from the closest line
				581	* in A for other lines in A for which
				582	* similarities may be calculated.
				583	*/
				584	static int get_similarity(int similarities,
				585	int line_a, int local_line_b,
				586	int closest_line_a, int max_search_distance_a)
				587	{
				588	assert(abs(line_a - closest_line_a) <=
				589	max_search_distance_a);
				590	return similarities + line_a - closest_line_a +
				591	max_search_distance_a +
				592	local_line_b * (max_search_distance_a * 2 + 1);
				593	}
				594
				595	#define CERTAIN_NOTHING_MATCHES -2
				596	#define CERTAINTY_NOT_CALCULATED -1
				597
				598	/* Given a line in B, first calculate its similarities with nearby lines in A
				599	* if not already calculated, then identify the most similar and second most
				600	* similar lines. The "certainty" is calculated based on those two
				601	* similarities.
				602	*
				603	* \param start_a the index of the first line of the chunk in A
				604	* \param length_a the length in lines of the chunk in A
				605	* \param local_line_b the index of the line in B, relative to the first line
				606	* in the chunk.
				607	* \param fingerprints_a array of fingerprints for the chunk in A
				608	* \param fingerprints_b array of fingerprints for the chunk in B
				609	* \param similarities 2-dimensional array of similarities between lines in A
				610	* and B. See get_similarity() for more details.
				611	* \param certainties array of values indicating how strongly a line in B is
				612	* matched with some line in A.
				613	* \param second_best_result array of absolute indices in A for the second
				614	* closest match of a line in B.
				615	* \param result array of absolute indices in A for the closest match of a line
				616	* in B.
				617	* \param max_search_distance_a maximum distance in lines from the closest line
				618	* in A for other lines in A for which
				619	* similarities may be calculated.
				620	* \param map_line_number_in_b_to_a parameter to map_line_number().
				621	*/
				622	static void find_best_line_matches(
				623	int start_a,
				624	int length_a,
				625	int start_b,
				626	int local_line_b,
				627	struct fingerprint *fingerprints_a,
				628	struct fingerprint *fingerprints_b,
				629	int *similarities,
				630	int *certainties,
				631	int *second_best_result,
				632	int *result,
				633	const int max_search_distance_a,
				634	const struct line_number_mapping *map_line_number_in_b_to_a)
				635	{
				636
				637	int i, search_start, search_end, closest_local_line_a, *similarity,
				638	best_similarity = 0, second_best_similarity = 0,
				639	best_similarity_index = 0, second_best_similarity_index = 0;
				640
				641	/* certainty has already been calculated so no need to redo the work */
				642	if (certainties[local_line_b] != CERTAINTY_NOT_CALCULATED)
				643	return;
				644
				645	closest_local_line_a = map_line_number(
				646	local_line_b + start_b, map_line_number_in_b_to_a) - start_a;
				647
				648	search_start = closest_local_line_a - max_search_distance_a;
				649	if (search_start < 0)
				650	search_start = 0;
				651
				652	search_end = closest_local_line_a + max_search_distance_a + 1;
				653	if (search_end > length_a)
				654	search_end = length_a;
				655
				656	for (i = search_start; i < search_end; ++i) {
				657	similarity = get_similarity(similarities,
				658	i, local_line_b,
				659	closest_local_line_a,
				660	max_search_distance_a);
				661	if (*similarity == -1) {
				662	/* This value will never exceed 10 but assert just in
				663	* case
				664	*/
				665	assert(abs(i - closest_local_line_a) < 1000);
				666	/* scale the similarity by (1000 - distance from
				667	* closest line) to act as a tie break between lines
				668	* that otherwise are equally similar.
				669	*/
				670	*similarity = fingerprint_similarity(
				671	fingerprints_b + local_line_b,
				672	fingerprints_a + i) *
				673	(1000 - abs(i - closest_local_line_a));
				674	}
				675	if (*similarity > best_similarity) {
				676	second_best_similarity = best_similarity;
				677	second_best_similarity_index = best_similarity_index;
				678	best_similarity = *similarity;
				679	best_similarity_index = i;
				680	} else if (*similarity > second_best_similarity) {
				681	second_best_similarity = *similarity;
				682	second_best_similarity_index = i;
				683	}
				684	}
				685
				686	if (best_similarity == 0) {
				687	/* this line definitely doesn't match with anything. Mark it
				688	* with this special value so it doesn't get invalidated and
				689	* won't be recalculated.
				690	*/
				691	certainties[local_line_b] = CERTAIN_NOTHING_MATCHES;
				692	result[local_line_b] = -1;
				693	} else {
				694	/* Calculate the certainty with which this line matches.
				695	* If the line matches well with two lines then that reduces
				696	* the certainty. However we still want to prioritise matching
				697	* a line that matches very well with two lines over matching a
				698	* line that matches poorly with one line, hence doubling
				699	* best_similarity.
				700	* This means that if we have
				701	* line X that matches only one line with a score of 3,
				702	* line Y that matches two lines equally with a score of 5,
				703	* and line Z that matches only one line with a score or 2,
				704	* then the lines in order of certainty are X, Y, Z.
				705	*/
				706	certainties[local_line_b] = best_similarity * 2 -
				707	second_best_similarity;
				708
				709	/* We keep both the best and second best results to allow us to
				710	* check at a later stage of the matching process whether the
				711	* result needs to be invalidated.
				712	*/
				713	result[local_line_b] = start_a + best_similarity_index;
				714	second_best_result[local_line_b] =
				715	start_a + second_best_similarity_index;
				716	}
				717	}
				718
				719	/*
				720	* This finds the line that we can match with the most confidence, and
				721	* uses it as a partition. It then calls itself on the lines on either side of
				722	* that partition. In this way we avoid lines appearing out of order, and
				723	* retain a sensible line ordering.
				724	* \param start_a index of the first line in A with which lines in B may be
				725	* compared.
				726	* \param start_b index of the first line in B for which matching should be
				727	* done.
				728	* \param length_a number of lines in A with which lines in B may be compared.
				729	* \param length_b number of lines in B for which matching should be done.
				730	* \param fingerprints_a mutable array of fingerprints in A. The first element
				731	* corresponds to the line at start_a.
				732	* \param fingerprints_b array of fingerprints in B. The first element
				733	* corresponds to the line at start_b.
				734	* \param similarities 2-dimensional array of similarities between lines in A
				735	* and B. See get_similarity() for more details.
				736	* \param certainties array of values indicating how strongly a line in B is
				737	* matched with some line in A.
				738	* \param second_best_result array of absolute indices in A for the second
				739	* closest match of a line in B.
				740	* \param result array of absolute indices in A for the closest match of a line
				741	* in B.
				742	* \param max_search_distance_a maximum distance in lines from the closest line
				743	* in A for other lines in A for which
				744	* similarities may be calculated.
				745	* \param max_search_distance_b an upper bound on the greatest possible
				746	* distance between lines in B such that they will
				747	* both be compared with the same line in A
				748	* according to max_search_distance_a.
				749	* \param map_line_number_in_b_to_a parameter to map_line_number().
				750	*/
				751	static void fuzzy_find_matching_lines_recurse(
				752	int start_a, int start_b,
				753	int length_a, int length_b,
				754	struct fingerprint *fingerprints_a,
				755	struct fingerprint *fingerprints_b,
				756	int *similarities,
				757	int *certainties,
				758	int *second_best_result,
				759	int *result,
				760	int max_search_distance_a,
				761	int max_search_distance_b,
				762	const struct line_number_mapping *map_line_number_in_b_to_a)
				763	{
				764	int i, invalidate_min, invalidate_max, offset_b,
				765	second_half_start_a, second_half_start_b,
				766	second_half_length_a, second_half_length_b,
				767	most_certain_line_a, most_certain_local_line_b = -1,
				768	most_certain_line_certainty = -1,
				769	closest_local_line_a;
				770
				771	for (i = 0; i < length_b; ++i) {
				772	find_best_line_matches(start_a,
				773	length_a,
				774	start_b,
				775	i,
				776	fingerprints_a,
				777	fingerprints_b,
				778	similarities,
				779	certainties,
				780	second_best_result,
				781	result,
				782	max_search_distance_a,
				783	map_line_number_in_b_to_a);
				784
				785	if (certainties[i] > most_certain_line_certainty) {
				786	most_certain_line_certainty = certainties[i];
				787	most_certain_local_line_b = i;
				788	}
				789	}
				790
				791	/* No matches. */
				792	if (most_certain_local_line_b == -1)
				793	return;
				794
				795	most_certain_line_a = result[most_certain_local_line_b];
				796
				797	/*
				798	* Subtract the most certain line's fingerprint in B from the matched
				799	* fingerprint in A. This means that other lines in B can't also match
				800	* the same parts of the line in A.
				801	*/
				802	fingerprint_subtract(fingerprints_a + most_certain_line_a - start_a,
				803	fingerprints_b + most_certain_local_line_b);
				804
				805	/* Invalidate results that may be affected by the choice of most
				806	* certain line.
				807	*/
				808	invalidate_min = most_certain_local_line_b - max_search_distance_b;
				809	invalidate_max = most_certain_local_line_b + max_search_distance_b + 1;
				810	if (invalidate_min < 0)
				811	invalidate_min = 0;
				812	if (invalidate_max > length_b)
				813	invalidate_max = length_b;
				814
				815	/* As the fingerprint in A has changed, discard previously calculated
				816	* similarity values with that fingerprint.
				817	*/
				818	for (i = invalidate_min; i < invalidate_max; ++i) {
				819	closest_local_line_a = map_line_number(
				820	i + start_b, map_line_number_in_b_to_a) - start_a;
				821
				822	/* Check that the lines in A and B are close enough that there
				823	* is a similarity value for them.
				824	*/
				825	if (abs(most_certain_line_a - start_a - closest_local_line_a) >
				826	max_search_distance_a) {
				827	continue;
				828	}
				829
				830	*get_similarity(similarities, most_certain_line_a - start_a,
				831	i, closest_local_line_a,
				832	max_search_distance_a) = -1;
				833	}
				834
				835	/* More invalidating of results that may be affected by the choice of
				836	* most certain line.
				837	* Discard the matches for lines in B that are currently matched with a
				838	* line in A such that their ordering contradicts the ordering imposed
				839	* by the choice of most certain line.
				840	*/
				841	for (i = most_certain_local_line_b - 1; i >= invalidate_min; --i) {
				842	/* In this loop we discard results for lines in B that are
				843	* before most-certain-line-B but are matched with a line in A
				844	* that is after most-certain-line-A.
				845	*/
				846	if (certainties[i] >= 0 &&
				847	(result[i] >= most_certain_line_a \|\|
				848	second_best_result[i] >= most_certain_line_a)) {
				849	certainties[i] = CERTAINTY_NOT_CALCULATED;
				850	}
				851	}
				852	for (i = most_certain_local_line_b + 1; i < invalidate_max; ++i) {
				853	/* In this loop we discard results for lines in B that are
				854	* after most-certain-line-B but are matched with a line in A
				855	* that is before most-certain-line-A.
				856	*/
				857	if (certainties[i] >= 0 &&
				858	(result[i] <= most_certain_line_a \|\|
				859	second_best_result[i] <= most_certain_line_a)) {
				860	certainties[i] = CERTAINTY_NOT_CALCULATED;
				861	}
				862	}
				863
				864	/* Repeat the matching process for lines before the most certain line.
				865	*/
				866	if (most_certain_local_line_b > 0) {
				867	fuzzy_find_matching_lines_recurse(
				868	start_a, start_b,
				869	most_certain_line_a + 1 - start_a,
				870	most_certain_local_line_b,
				871	fingerprints_a, fingerprints_b, similarities,
				872	certainties, second_best_result, result,
				873	max_search_distance_a,
				874	max_search_distance_b,
				875	map_line_number_in_b_to_a);
				876	}
				877	/* Repeat the matching process for lines after the most certain line.
				878	*/
				879	if (most_certain_local_line_b + 1 < length_b) {
				880	second_half_start_a = most_certain_line_a;
				881	offset_b = most_certain_local_line_b + 1;
				882	second_half_start_b = start_b + offset_b;
				883	second_half_length_a =
				884	length_a + start_a - second_half_start_a;
				885	second_half_length_b =
				886	length_b + start_b - second_half_start_b;
				887	fuzzy_find_matching_lines_recurse(
				888	second_half_start_a, second_half_start_b,
				889	second_half_length_a, second_half_length_b,
				890	fingerprints_a + second_half_start_a - start_a,
				891	fingerprints_b + offset_b,
				892	similarities +
				893	offset_b * (max_search_distance_a * 2 + 1),
				894	certainties + offset_b,
				895	second_best_result + offset_b, result + offset_b,
				896	max_search_distance_a,
				897	max_search_distance_b,
				898	map_line_number_in_b_to_a);
				899	}
				900	}
				901
				902	/* Find the lines in the parent line range that most closely match the lines in
				903	* the target line range. This is accomplished by matching fingerprints in each
				904	* blame_origin, and choosing the best matches that preserve the line ordering.
				905	* See struct fingerprint for details of fingerprint matching, and
				906	* fuzzy_find_matching_lines_recurse for details of preserving line ordering.
				907	*
				908	* The performance is believed to be O(n log n) in the typical case and O(n^2)
				909	* in a pathological case, where n is the number of lines in the target range.
				910	*/
				911	static int fuzzy_find_matching_lines(struct blame_origin parent,
				912	struct blame_origin *target,
				913	int tlno, int parent_slno, int same,
				914	int parent_len)
				915	{
				916	/* We use the terminology "A" for the left hand side of the diff AKA
				917	* parent, and "B" for the right hand side of the diff AKA target. */
				918	int start_a = parent_slno;
				919	int length_a = parent_len;
				920	int start_b = tlno;
				921	int length_b = same - tlno;
				922
				923	struct line_number_mapping map_line_number_in_b_to_a = {
				924	start_a, length_a, start_b, length_b
				925	};
				926
				927	struct fingerprint *fingerprints_a = parent->fingerprints;
				928	struct fingerprint *fingerprints_b = target->fingerprints;
				929
				930	int i, result, second_best_result,
				931	certainties, similarities, similarity_count;
				932
				933	/*
				934	* max_search_distance_a means that given a line in B, compare it to
				935	* the line in A that is closest to its position, and the lines in A
				936	* that are no greater than max_search_distance_a lines away from the
				937	* closest line in A.
				938	*
				939	* max_search_distance_b is an upper bound on the greatest possible
				940	* distance between lines in B such that they will both be compared
				941	* with the same line in A according to max_search_distance_a.
				942	*/
				943	int max_search_distance_a = 10, max_search_distance_b;
				944
				945	if (length_a <= 0)
				946	return NULL;
				947
				948	if (max_search_distance_a >= length_a)
				949	max_search_distance_a = length_a ? length_a - 1 : 0;
				950
				951	max_search_distance_b = ((2 * max_search_distance_a + 1) * length_b
				952	- 1) / length_a;
				953
				954	result = xcalloc(sizeof(int), length_b);
				955	second_best_result = xcalloc(sizeof(int), length_b);
				956	certainties = xcalloc(sizeof(int), length_b);
				957
				958	/* See get_similarity() for details of similarities. */
				959	similarity_count = length_b * (max_search_distance_a * 2 + 1);
				960	similarities = xcalloc(sizeof(int), similarity_count);
				961
				962	for (i = 0; i < length_b; ++i) {
				963	result[i] = -1;
				964	second_best_result[i] = -1;
				965	certainties[i] = CERTAINTY_NOT_CALCULATED;
				966	}
				967
				968	for (i = 0; i < similarity_count; ++i)
				969	similarities[i] = -1;
				970
				971	fuzzy_find_matching_lines_recurse(start_a, start_b,
				972	length_a, length_b,
				973	fingerprints_a + start_a,
				974	fingerprints_b + start_b,
				975	similarities,
				976	certainties,
				977	second_best_result,
				978	result,
				979	max_search_distance_a,
				980	max_search_distance_b,
				981	&map_line_number_in_b_to_a);
				982
				983	free(similarities);
				984	free(certainties);
				985	free(second_best_result);
				986
				987	return result;
				988	}
				989
Jeff King	07a54dc	2019-06-28 02:24:57 -0400	[diff] [blame]	990	static void fill_origin_fingerprints(struct blame_origin *o)
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	991	{
				992	int *line_starts;
				993
				994	if (o->fingerprints)
				995	return;
				996	o->num_lines = find_line_starts(&line_starts, o->file.ptr,
				997	o->file.size);
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	998	o->fingerprints = xcalloc(sizeof(struct fingerprint), o->num_lines);
				999	get_line_fingerprints(o->fingerprints, o->file.ptr, line_starts,
				1000	0, o->num_lines);
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	1001	free(line_starts);
				1002	}
				1003
				1004	static void drop_origin_fingerprints(struct blame_origin *o)
				1005	{
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	1006	if (o->fingerprints) {
				1007	free_line_fingerprints(o->fingerprints, o->num_lines);
				1008	o->num_lines = 0;
				1009	FREE_AND_NULL(o->fingerprints);
				1010	}
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	1011	}
				1012
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1013	/*
				1014	* Given an origin, prepare mmfile_t structure to be used by the
				1015	* diff machinery
				1016	*/
				1017	static void fill_origin_blob(struct diff_options *opt,
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	1018	struct blame_origin o, mmfile_t file,
				1019	int *num_read_blob, int fill_fingerprints)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1020	{
				1021	if (!o->file.ptr) {
				1022	enum object_type type;
				1023	unsigned long file_size;
				1024
				1025	(*num_read_blob)++;
Brandon Williams	0d1e0e7	2017-10-31 11:19:11 -0700	[diff] [blame]	1026	if (opt->flags.allow_textconv &&
Nguyễn Thái Ngọc Duy	6afaf80	2018-09-21 17:57:22 +0200	[diff] [blame]	1027	textconv_object(opt->repo, o->path, o->mode,
				1028	&o->blob_oid, 1, &file->ptr, &file_size))
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1029	;
				1030	else
brian m. carlson	b4f5aca	2018-03-12 02:27:53 +0000	[diff] [blame]	1031	file->ptr = read_object_file(&o->blob_oid, &type,
				1032	&file_size);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1033	file->size = file_size;
				1034
				1035	if (!file->ptr)
				1036	die("Cannot read blob %s for path %s",
				1037	oid_to_hex(&o->blob_oid),
				1038	o->path);
				1039	o->file = *file;
				1040	}
				1041	else
				1042	*file = o->file;
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	1043	if (fill_fingerprints)
Jeff King	07a54dc	2019-06-28 02:24:57 -0400	[diff] [blame]	1044	fill_origin_fingerprints(o);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1045	}
				1046
				1047	static void drop_origin_blob(struct blame_origin *o)
				1048	{
Ævar Arnfjörð Bjarmason	ce528de	2018-08-17 13:02:50 +0000	[diff] [blame]	1049	FREE_AND_NULL(o->file.ptr);
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	1050	drop_origin_fingerprints(o);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1051	}
				1052
				1053	/*
				1054	* Any merge of blames happens on lists of blames that arrived via
				1055	* different parents in a single suspect. In this case, we want to
				1056	* sort according to the suspect line numbers as opposed to the final
				1057	* image line numbers. The function body is somewhat longish because
				1058	* it avoids unnecessary writes.
				1059	*/
				1060
				1061	static struct blame_entry blame_merge(struct blame_entry list1,
				1062	struct blame_entry *list2)
				1063	{
				1064	struct blame_entry p1 = list1, p2 = list2,
				1065	**tail = &list1;
				1066
				1067	if (!p1)
				1068	return p2;
				1069	if (!p2)
				1070	return p1;
				1071
				1072	if (p1->s_lno <= p2->s_lno) {
				1073	do {
				1074	tail = &p1->next;
				1075	if ((p1 = *tail) == NULL) {
				1076	*tail = p2;
				1077	return list1;
				1078	}
				1079	} while (p1->s_lno <= p2->s_lno);
				1080	}
				1081	for (;;) {
				1082	*tail = p2;
				1083	do {
				1084	tail = &p2->next;
				1085	if ((p2 = *tail) == NULL) {
				1086	*tail = p1;
				1087	return list1;
				1088	}
				1089	} while (p1->s_lno > p2->s_lno);
				1090	*tail = p1;
				1091	do {
				1092	tail = &p1->next;
				1093	if ((p1 = *tail) == NULL) {
				1094	*tail = p2;
				1095	return list1;
				1096	}
				1097	} while (p1->s_lno <= p2->s_lno);
				1098	}
				1099	}
				1100
				1101	static void get_next_blame(const void p)
				1102	{
				1103	return ((struct blame_entry *)p)->next;
				1104	}
				1105
				1106	static void set_next_blame(void p1, void p2)
				1107	{
				1108	((struct blame_entry *)p1)->next = p2;
				1109	}
				1110
				1111	/*
				1112	* Final image line numbers are all different, so we don't need a
				1113	* three-way comparison here.
				1114	*/
				1115
				1116	static int compare_blame_final(const void p1, const void p2)
				1117	{
				1118	return ((struct blame_entry )p1)->lno > ((struct blame_entry )p2)->lno
				1119	? 1 : -1;
				1120	}
				1121
				1122	static int compare_blame_suspect(const void p1, const void p2)
				1123	{
				1124	const struct blame_entry s1 = p1, s2 = p2;
				1125	/*
				1126	* to allow for collating suspects, we sort according to the
				1127	* respective pointer value as the primary sorting criterion.
				1128	* The actual relation is pretty unimportant as long as it
				1129	* establishes a total order. Comparing as integers gives us
				1130	* that.
				1131	*/
				1132	if (s1->suspect != s2->suspect)
				1133	return (intptr_t)s1->suspect > (intptr_t)s2->suspect ? 1 : -1;
				1134	if (s1->s_lno == s2->s_lno)
				1135	return 0;
				1136	return s1->s_lno > s2->s_lno ? 1 : -1;
				1137	}
				1138
				1139	void blame_sort_final(struct blame_scoreboard *sb)
				1140	{
				1141	sb->ent = llist_mergesort(sb->ent, get_next_blame, set_next_blame,
				1142	compare_blame_final);
				1143	}
				1144
/*
 * Comparison callback yielding the negated result of
 * compare_commits_by_commit_date() for the same arguments.
 */
static int compare_commits_by_reverse_commit_date(const void *a,
						  const void *b,
						  void *c)
{
	int forward = compare_commits_by_commit_date(a, b, c);

	return -forward;
}
				1151
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1152	/*
				1153	* For debugging -- origin is refcounted, and this asserts that
				1154	* we do not underflow.
				1155	*/
				1156	static void sanity_check_refcnt(struct blame_scoreboard *sb)
				1157	{
				1158	int baa = 0;
				1159	struct blame_entry *ent;
				1160
				1161	for (ent = sb->ent; ent; ent = ent->next) {
				1162	/* Nobody should have zero or negative refcnt */
				1163	if (ent->suspect->refcnt <= 0) {
				1164	fprintf(stderr, "%s in %s has negative refcnt %d\n",
				1165	ent->suspect->path,
				1166	oid_to_hex(&ent->suspect->commit->object.oid),
				1167	ent->suspect->refcnt);
				1168	baa = 1;
				1169	}
				1170	}
				1171	if (baa)
				1172	sb->on_sanity_fail(sb, baa);
				1173	}
				1174
				1175	/*
				1176	* If two blame entries that are next to each other came from
				1177	* contiguous lines in the same origin (i.e. <commit, path> pair),
				1178	* merge them together.
				1179	*/
				1180	void blame_coalesce(struct blame_scoreboard *sb)
				1181	{
				1182	struct blame_entry ent, next;
				1183
				1184	for (ent = sb->ent; ent && (next = ent->next); ent = next) {
				1185	if (ent->suspect == next->suspect &&
Barret Rhoden	8934ac8	2019-05-15 17:45:00 -0400	[diff] [blame]	1186	ent->s_lno + ent->num_lines == next->s_lno &&
				1187	ent->ignored == next->ignored &&
				1188	ent->unblamable == next->unblamable) {
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1189	ent->num_lines += next->num_lines;
				1190	ent->next = next->next;
				1191	blame_origin_decref(next->suspect);
				1192	free(next);
				1193	ent->score = 0;
				1194	next = ent; /* again */
				1195	}
				1196	}
				1197
				1198	if (sb->debug) /* sanity */
				1199	sanity_check_refcnt(sb);
				1200	}
				1201
				1202	/*
				1203	* Merge the given sorted list of blames into a preexisting origin.
				1204	* If there were no previous blames to that commit, it is entered into
				1205	* the commit priority queue of the score board.
				1206	*/
				1207
				1208	static void queue_blames(struct blame_scoreboard sb, struct blame_origin porigin,
				1209	struct blame_entry *sorted)
				1210	{
				1211	if (porigin->suspects)
				1212	porigin->suspects = blame_merge(porigin->suspects, sorted);
				1213	else {
				1214	struct blame_origin *o;
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	1215	for (o = get_blame_suspects(porigin->commit); o; o = o->next) {
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1216	if (o->suspects) {
				1217	porigin->suspects = sorted;
				1218	return;
				1219	}
				1220	}
				1221	porigin->suspects = sorted;
				1222	prio_queue_put(&sb->commits, porigin->commit);
				1223	}
				1224	}
				1225
				1226	/*
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	1227	* Fill the blob_sha1 field of an origin if it hasn't, so that later
				1228	* call to fill_origin_blob() can use it to locate the data. blob_sha1
				1229	* for an origin is also used to pass the blame for the entire file to
				1230	* the parent to detect the case where a child's blob is identical to
				1231	* that of its parent's.
				1232	*
				1233	* This also fills origin->mode for corresponding tree path.
				1234	*/
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	1235	static int fill_blob_sha1_and_mode(struct repository *r,
Nguyễn Thái Ngọc Duy	ecbbc0a	2018-08-13 18:14:41 +0200	[diff] [blame]	1236	struct blame_origin *origin)
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	1237	{
				1238	if (!is_null_oid(&origin->blob_oid))
				1239	return 0;
Nguyễn Thái Ngọc Duy	50ddb08	2019-06-27 16:28:49 +0700	[diff] [blame]	1240	if (get_tree_entry(r, &origin->commit->object.oid, origin->path, &origin->blob_oid, &origin->mode))
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	1241	goto error_out;
Nguyễn Thái Ngọc Duy	a470bee	2018-09-21 17:57:21 +0200	[diff] [blame]	1242	if (oid_object_info(r, &origin->blob_oid, NULL) != OBJ_BLOB)
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	1243	goto error_out;
				1244	return 0;
				1245	error_out:
				1246	oidclr(&origin->blob_oid);
				1247	origin->mode = S_IFINVALID;
				1248	return -1;
				1249	}
				1250
/*
 * Changed-path Bloom filter keys. These can help prevent
 * computing diffs against first parents, but we need to
 * expand the list as code is moved or files are renamed.
 */
struct blame_bloom_data {
	/* Settings handed to fill_bloom_key() and bloom_filter_contains(). */
	struct bloom_filter_settings *settings;
	/* Array of individually allocated keys, one per added path. */
	struct bloom_key **keys;
	/* Number of entries of keys[] currently in use. */
	int nr;
	/* Number of entries keys[] has room for. */
	int alloc;
};
				1262
				1263	static int bloom_count_queries = 0;
				1264	static int bloom_count_no = 0;
				1265	static int maybe_changed_path(struct repository *r,
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	1266	struct blame_origin *origin,
				1267	struct blame_bloom_data *bd)
				1268	{
				1269	int i;
				1270	struct bloom_filter *filter;
				1271
				1272	if (!bd)
				1273	return 1;
				1274
Abhishek Kumar	c49c82a	2020-06-17 14:44:10 +0530	[diff] [blame]	1275	if (commit_graph_generation(origin->commit) == GENERATION_NUMBER_INFINITY)
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	1276	return 1;
				1277
				1278	filter = get_bloom_filter(r, origin->commit, 0);
				1279
				1280	if (!filter)
				1281	return 1;
				1282
				1283	bloom_count_queries++;
				1284	for (i = 0; i < bd->nr; i++) {
				1285	if (bloom_filter_contains(filter,
				1286	bd->keys[i],
				1287	bd->settings))
				1288	return 1;
				1289	}
				1290
				1291	bloom_count_no++;
				1292	return 0;
				1293	}
				1294
				1295	static void add_bloom_key(struct blame_bloom_data *bd,
				1296	const char *path)
				1297	{
				1298	if (!bd)
				1299	return;
				1300
				1301	if (bd->nr >= bd->alloc) {
				1302	bd->alloc *= 2;
				1303	REALLOC_ARRAY(bd->keys, bd->alloc);
				1304	}
				1305
				1306	bd->keys[bd->nr] = xmalloc(sizeof(struct bloom_key));
				1307	fill_bloom_key(path, strlen(path), bd->keys[bd->nr], bd->settings);
				1308	bd->nr++;
				1309	}
				1310
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	1311	/*
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1312	* We have an origin -- check if the same path exists in the
				1313	* parent and return an origin structure to represent it.
				1314	*/
Nguyễn Thái Ngọc Duy	e675765	2018-09-21 17:57:24 +0200	[diff] [blame]	1315	static struct blame_origin find_origin(struct repository r,
				1316	struct commit *parent,
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	1317	struct blame_origin *origin,
				1318	struct blame_bloom_data *bd)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1319	{
				1320	struct blame_origin *porigin;
				1321	struct diff_options diff_opts;
				1322	const char *paths[2];
				1323
				1324	/* First check any existing origins */
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	1325	for (porigin = get_blame_suspects(parent); porigin; porigin = porigin->next)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1326	if (!strcmp(porigin->path, origin->path)) {
				1327	/*
				1328	* The same path between origin and its parent
				1329	* without renaming -- the most common case.
				1330	*/
				1331	return blame_origin_incref (porigin);
				1332	}
				1333
				1334	/* See if the origin->path is different between parent
				1335	* and origin first. Most of the time they are the
				1336	* same and diff-tree is fairly efficient about this.
				1337	*/
Nguyễn Thái Ngọc Duy	e675765	2018-09-21 17:57:24 +0200	[diff] [blame]	1338	repo_diff_setup(r, &diff_opts);
Brandon Williams	0d1e0e7	2017-10-31 11:19:11 -0700	[diff] [blame]	1339	diff_opts.flags.recursive = 1;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1340	diff_opts.detect_rename = 0;
				1341	diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
				1342	paths[0] = origin->path;
				1343	paths[1] = NULL;
				1344
				1345	parse_pathspec(&diff_opts.pathspec,
				1346	PATHSPEC_ALL_MAGIC & ~PATHSPEC_LITERAL,
				1347	PATHSPEC_LITERAL_PATH, "", paths);
				1348	diff_setup_done(&diff_opts);
				1349
				1350	if (is_null_oid(&origin->commit->object.oid))
Derrick Stolee	2e27bd7	2018-04-06 19:09:38 +0000	[diff] [blame]	1351	do_diff_cache(get_commit_tree_oid(parent), &diff_opts);
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	1352	else {
				1353	int compute_diff = 1;
				1354	if (origin->commit->parents &&
				1355	!oidcmp(&parent->object.oid,
				1356	&origin->commit->parents->item->object.oid))
Jeff King	fe88f9f	2020-04-23 17:03:03 -0400	[diff] [blame]	1357	compute_diff = maybe_changed_path(r, origin, bd);
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	1358
				1359	if (compute_diff)
				1360	diff_tree_oid(get_commit_tree_oid(parent),
				1361	get_commit_tree_oid(origin->commit),
				1362	"", &diff_opts);
				1363	}
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1364	diffcore_std(&diff_opts);
				1365
				1366	if (!diff_queued_diff.nr) {
				1367	/* The path is the same as parent */
				1368	porigin = get_origin(parent, origin->path);
				1369	oidcpy(&porigin->blob_oid, &origin->blob_oid);
				1370	porigin->mode = origin->mode;
				1371	} else {
				1372	/*
				1373	* Since origin->path is a pathspec, if the parent
				1374	* commit had it as a directory, we will see a whole
				1375	* bunch of deletion of files in the directory that we
				1376	* do not care about.
				1377	*/
				1378	int i;
				1379	struct diff_filepair *p = NULL;
				1380	for (i = 0; i < diff_queued_diff.nr; i++) {
				1381	const char *name;
				1382	p = diff_queued_diff.queue[i];
				1383	name = p->one->path ? p->one->path : p->two->path;
				1384	if (!strcmp(name, origin->path))
				1385	break;
				1386	}
				1387	if (!p)
				1388	die("internal error in blame::find_origin");
				1389	switch (p->status) {
				1390	default:
				1391	die("internal error in blame::find_origin (%c)",
				1392	p->status);
				1393	case 'M':
				1394	porigin = get_origin(parent, origin->path);
				1395	oidcpy(&porigin->blob_oid, &p->one->oid);
				1396	porigin->mode = p->one->mode;
				1397	break;
				1398	case 'A':
				1399	case 'T':
				1400	/* Did not exist in parent, or type changed */
				1401	break;
				1402	}
				1403	}
				1404	diff_flush(&diff_opts);
				1405	clear_pathspec(&diff_opts.pathspec);
				1406	return porigin;
				1407	}
				1408
				1409	/*
				1410	* We have an origin -- find the path that corresponds to it in its
				1411	* parent and return an origin structure to represent it.
				1412	*/
Nguyễn Thái Ngọc Duy	e675765	2018-09-21 17:57:24 +0200	[diff] [blame]	1413	static struct blame_origin find_rename(struct repository r,
				1414	struct commit *parent,
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	1415	struct blame_origin *origin,
				1416	struct blame_bloom_data *bd)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1417	{
				1418	struct blame_origin *porigin = NULL;
				1419	struct diff_options diff_opts;
				1420	int i;
				1421
Nguyễn Thái Ngọc Duy	e675765	2018-09-21 17:57:24 +0200	[diff] [blame]	1422	repo_diff_setup(r, &diff_opts);
Brandon Williams	0d1e0e7	2017-10-31 11:19:11 -0700	[diff] [blame]	1423	diff_opts.flags.recursive = 1;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1424	diff_opts.detect_rename = DIFF_DETECT_RENAME;
				1425	diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
				1426	diff_opts.single_follow = origin->path;
				1427	diff_setup_done(&diff_opts);
				1428
				1429	if (is_null_oid(&origin->commit->object.oid))
Derrick Stolee	2e27bd7	2018-04-06 19:09:38 +0000	[diff] [blame]	1430	do_diff_cache(get_commit_tree_oid(parent), &diff_opts);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1431	else
Derrick Stolee	2e27bd7	2018-04-06 19:09:38 +0000	[diff] [blame]	1432	diff_tree_oid(get_commit_tree_oid(parent),
				1433	get_commit_tree_oid(origin->commit),
Junio C Hamano	a6f38c1	2017-06-19 12:38:44 -0700	[diff] [blame]	1434	"", &diff_opts);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1435	diffcore_std(&diff_opts);
				1436
				1437	for (i = 0; i < diff_queued_diff.nr; i++) {
				1438	struct diff_filepair *p = diff_queued_diff.queue[i];
				1439	if ((p->status == 'R' \|\| p->status == 'C') &&
				1440	!strcmp(p->two->path, origin->path)) {
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	1441	add_bloom_key(bd, p->one->path);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1442	porigin = get_origin(parent, p->one->path);
				1443	oidcpy(&porigin->blob_oid, &p->one->oid);
				1444	porigin->mode = p->one->mode;
				1445	break;
				1446	}
				1447	}
				1448	diff_flush(&diff_opts);
				1449	clear_pathspec(&diff_opts.pathspec);
				1450	return porigin;
				1451	}
				1452
				1453	/*
				1454	* Append a new blame entry to a given output queue.
				1455	*/
				1456	static void add_blame_entry(struct blame_entry ***queue,
				1457	const struct blame_entry *src)
				1458	{
				1459	struct blame_entry e = xmalloc(sizeof(e));
				1460	memcpy(e, src, sizeof(*e));
				1461	blame_origin_incref(e->suspect);
				1462
				1463	e->next = **queue;
				1464	**queue = e;
				1465	*queue = &e->next;
				1466	}
				1467
				1468	/*
				1469	* src typically is on-stack; we want to copy the information in it to
				1470	* a malloced blame_entry that gets added to the given queue. The
				1471	* origin of dst loses a refcnt.
				1472	*/
				1473	static void dup_entry(struct blame_entry ***queue,
				1474	struct blame_entry dst, struct blame_entry src)
				1475	{
				1476	blame_origin_incref(src->suspect);
				1477	blame_origin_decref(dst->suspect);
				1478	memcpy(dst, src, sizeof(*src));
				1479	dst->next = **queue;
				1480	**queue = dst;
				1481	*queue = &dst->next;
				1482	}
				1483
				1484	const char blame_nth_line(struct blame_scoreboard sb, long lno)
				1485	{
				1486	return sb->final_buf + sb->lineno[lno];
				1487	}
				1488
				1489	/*
				1490	* It is known that lines between tlno to same came from parent, and e
				1491	* has an overlap with that range. it also is known that parent's
				1492	* line plno corresponds to e's line tlno.
				1493	*
				1494	* <---- e ----->
				1495	* <------>
				1496	* <------------>
				1497	* <------------>
				1498	* <------------------>
				1499	*
				1500	* Split e into potentially three parts; before this chunk, the chunk
				1501	* to be blamed for the parent, and after that portion.
				1502	*/
				1503	static void split_overlap(struct blame_entry *split,
				1504	struct blame_entry *e,
				1505	int tlno, int plno, int same,
				1506	struct blame_origin *parent)
				1507	{
				1508	int chunk_end_lno;
Barret Rhoden	8934ac8	2019-05-15 17:45:00 -0400	[diff] [blame]	1509	int i;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1510	memset(split, 0, sizeof(struct blame_entry [3]));
				1511
Barret Rhoden	8934ac8	2019-05-15 17:45:00 -0400	[diff] [blame]	1512	for (i = 0; i < 3; i++) {
				1513	split[i].ignored = e->ignored;
				1514	split[i].unblamable = e->unblamable;
				1515	}
				1516
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1517	if (e->s_lno < tlno) {
				1518	/* there is a pre-chunk part not blamed on parent */
				1519	split[0].suspect = blame_origin_incref(e->suspect);
				1520	split[0].lno = e->lno;
				1521	split[0].s_lno = e->s_lno;
				1522	split[0].num_lines = tlno - e->s_lno;
				1523	split[1].lno = e->lno + tlno - e->s_lno;
				1524	split[1].s_lno = plno;
				1525	}
				1526	else {
				1527	split[1].lno = e->lno;
				1528	split[1].s_lno = plno + (e->s_lno - tlno);
				1529	}
				1530
				1531	if (same < e->s_lno + e->num_lines) {
				1532	/* there is a post-chunk part not blamed on parent */
				1533	split[2].suspect = blame_origin_incref(e->suspect);
				1534	split[2].lno = e->lno + (same - e->s_lno);
				1535	split[2].s_lno = e->s_lno + (same - e->s_lno);
				1536	split[2].num_lines = e->s_lno + e->num_lines - same;
				1537	chunk_end_lno = split[2].lno;
				1538	}
				1539	else
				1540	chunk_end_lno = e->lno + e->num_lines;
				1541	split[1].num_lines = chunk_end_lno - split[1].lno;
				1542
				1543	/*
				1544	* if it turns out there is nothing to blame the parent for,
				1545	* forget about the splitting. !split[1].suspect signals this.
				1546	*/
				1547	if (split[1].num_lines < 1)
				1548	return;
				1549	split[1].suspect = blame_origin_incref(parent);
				1550	}
				1551
				1552	/*
				1553	* split_overlap() divided an existing blame e into up to three parts
				1554	* in split. Any assigned blame is moved to queue to
				1555	* reflect the split.
				1556	*/
				1557	static void split_blame(struct blame_entry ***blamed,
				1558	struct blame_entry ***unblamed,
				1559	struct blame_entry *split,
				1560	struct blame_entry *e)
				1561	{
				1562	if (split[0].suspect && split[2].suspect) {
				1563	/* The first part (reuse storage for the existing entry e) */
				1564	dup_entry(unblamed, e, &split[0]);
				1565
				1566	/* The last part -- me */
				1567	add_blame_entry(unblamed, &split[2]);
				1568
				1569	/* ... and the middle part -- parent */
				1570	add_blame_entry(blamed, &split[1]);
				1571	}
				1572	else if (!split[0].suspect && !split[2].suspect)
				1573	/*
				1574	* The parent covers the entire area; reuse storage for
				1575	* e and replace it with the parent.
				1576	*/
				1577	dup_entry(blamed, e, &split[1]);
				1578	else if (split[0].suspect) {
				1579	/* me and then parent */
				1580	dup_entry(unblamed, e, &split[0]);
				1581	add_blame_entry(blamed, &split[1]);
				1582	}
				1583	else {
				1584	/* parent and then me */
				1585	dup_entry(blamed, e, &split[1]);
				1586	add_blame_entry(unblamed, &split[2]);
				1587	}
				1588	}
				1589
				1590	/*
				1591	* After splitting the blame, the origins used by the
				1592	* on-stack blame_entry should lose one refcnt each.
				1593	*/
				1594	static void decref_split(struct blame_entry *split)
				1595	{
				1596	int i;
				1597
				1598	for (i = 0; i < 3; i++)
				1599	blame_origin_decref(split[i].suspect);
				1600	}
				1601
				1602	/*
				1603	* reverse_blame reverses the list given in head, appending tail.
				1604	* That allows us to build lists in reverse order, then reverse them
				1605	* afterwards. This can be faster than building the list in proper
				1606	* order right away. The reason is that building in proper order
				1607	* requires writing a link in the _previous_ element, while building
				1608	* in reverse order just requires placing the list head into the
				1609	* _current_ element.
				1610	*/
				1611
				1612	static struct blame_entry reverse_blame(struct blame_entry head,
				1613	struct blame_entry *tail)
				1614	{
				1615	while (head) {
				1616	struct blame_entry *next = head->next;
				1617	head->next = tail;
				1618	tail = head;
				1619	head = next;
				1620	}
				1621	return tail;
				1622	}
				1623
				1624	/*
Barret Rhoden	55f808f	2019-05-15 17:44:58 -0400	[diff] [blame]	1625	* Splits a blame entry into two entries at 'len' lines. The original 'e'
				1626	* consists of len lines, i.e. [e->lno, e->lno + len), and the second part,
				1627	* which is returned, consists of the remainder: [e->lno + len, e->lno +
				1628	* e->num_lines). The caller needs to sort out the reference counting for the
				1629	* new entry's suspect.
				1630	*/
				1631	static struct blame_entry split_blame_at(struct blame_entry e, int len,
				1632	struct blame_origin *new_suspect)
				1633	{
				1634	struct blame_entry *n = xcalloc(1, sizeof(struct blame_entry));
				1635
				1636	n->suspect = new_suspect;
Barret Rhoden	8934ac8	2019-05-15 17:45:00 -0400	[diff] [blame]	1637	n->ignored = e->ignored;
				1638	n->unblamable = e->unblamable;
Barret Rhoden	55f808f	2019-05-15 17:44:58 -0400	[diff] [blame]	1639	n->lno = e->lno + len;
				1640	n->s_lno = e->s_lno + len;
				1641	n->num_lines = e->num_lines - len;
				1642	e->num_lines = len;
				1643	e->score = 0;
				1644	return n;
				1645	}
				1646
/* Per-line verdict of the ignore-revision heuristic. */
struct blame_line_tracker {
	int is_parent;	/* non-zero: line is attributed to the parent */
	int s_lno;	/* line number within the attributed origin */
};
				1651
				1652	static int are_lines_adjacent(struct blame_line_tracker *first,
				1653	struct blame_line_tracker *second)
				1654	{
				1655	return first->is_parent == second->is_parent &&
				1656	first->s_lno + 1 == second->s_lno;
				1657	}
				1658
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	1659	static int scan_parent_range(struct fingerprint *p_fps,
				1660	struct fingerprint *t_fps, int t_idx,
				1661	int from, int nr_lines)
				1662	{
				1663	int sim, p_idx;
				1664	#define FINGERPRINT_FILE_THRESHOLD 10
				1665	int best_sim_val = FINGERPRINT_FILE_THRESHOLD;
				1666	int best_sim_idx = -1;
				1667
				1668	for (p_idx = from; p_idx < from + nr_lines; p_idx++) {
				1669	sim = fingerprint_similarity(&t_fps[t_idx], &p_fps[p_idx]);
				1670	if (sim < best_sim_val)
				1671	continue;
				1672	/* Break ties with the closest-to-target line number */
				1673	if (sim == best_sim_val && best_sim_idx != -1 &&
				1674	abs(best_sim_idx - t_idx) < abs(p_idx - t_idx))
				1675	continue;
				1676	best_sim_val = sim;
				1677	best_sim_idx = p_idx;
				1678	}
				1679	return best_sim_idx;
				1680	}
				1681
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1682	/*
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	1683	* The first pass checks the blame entry (from the target) against the parent's
				1684	* diff chunk. If that fails for a line, the second pass tries to match that
				1685	* line to any part of parent file. That catches cases where a change was
				1686	* broken into two chunks by 'context.'
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1687	*/
				1688	static void guess_line_blames(struct blame_origin *parent,
				1689	struct blame_origin *target,
				1690	int tlno, int offset, int same, int parent_len,
				1691	struct blame_line_tracker *line_blames)
				1692	{
				1693	int i, best_idx, target_idx;
				1694	int parent_slno = tlno + offset;
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	1695	int *fuzzy_matches;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1696
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	1697	fuzzy_matches = fuzzy_find_matching_lines(parent, target,
				1698	tlno, parent_slno, same,
				1699	parent_len);
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1700	for (i = 0; i < same - tlno; i++) {
				1701	target_idx = tlno + i;
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	1702	if (fuzzy_matches && fuzzy_matches[i] >= 0) {
				1703	best_idx = fuzzy_matches[i];
				1704	} else {
				1705	best_idx = scan_parent_range(parent->fingerprints,
				1706	target->fingerprints,
				1707	target_idx, 0,
				1708	parent->num_lines);
				1709	}
				1710	if (best_idx >= 0) {
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1711	line_blames[i].is_parent = 1;
				1712	line_blames[i].s_lno = best_idx;
				1713	} else {
				1714	line_blames[i].is_parent = 0;
				1715	line_blames[i].s_lno = target_idx;
				1716	}
				1717	}
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	1718	free(fuzzy_matches);
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1719	}
				1720
				1721	/*
				1722	* This decides which parts of a blame entry go to the parent (added to the
				1723	* ignoredp list) and which stay with the target (added to the diffp list). The
				1724	* actual decision was made in a separate heuristic function, and those answers
				1725	* for the lines in 'e' are in line_blames. This consumes e, essentially
				1726	* putting it on a list.
				1727	*
				1728	* Note that the blame entries on the ignoredp list are not necessarily sorted
				1729	* with respect to the parent's line numbers yet.
				1730	*/
				1731	static void ignore_blame_entry(struct blame_entry *e,
				1732	struct blame_origin *parent,
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1733	struct blame_entry **diffp,
				1734	struct blame_entry **ignoredp,
				1735	struct blame_line_tracker *line_blames)
				1736	{
				1737	int entry_len, nr_lines, i;
				1738
				1739	/*
				1740	* We carve new entries off the front of e. Each entry comes from a
				1741	* contiguous chunk of lines: adjacent lines from the same origin
				1742	* (either the parent or the target).
				1743	*/
				1744	entry_len = 1;
				1745	nr_lines = e->num_lines; /* e changes in the loop */
				1746	for (i = 0; i < nr_lines; i++) {
				1747	struct blame_entry *next = NULL;
				1748
				1749	/*
				1750	* We are often adjacent to the next line - only split the blame
				1751	* entry when we have to.
				1752	*/
				1753	if (i + 1 < nr_lines) {
				1754	if (are_lines_adjacent(&line_blames[i],
				1755	&line_blames[i + 1])) {
				1756	entry_len++;
				1757	continue;
				1758	}
				1759	next = split_blame_at(e, entry_len,
				1760	blame_origin_incref(e->suspect));
				1761	}
				1762	if (line_blames[i].is_parent) {
Barret Rhoden	8934ac8	2019-05-15 17:45:00 -0400	[diff] [blame]	1763	e->ignored = 1;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1764	blame_origin_decref(e->suspect);
				1765	e->suspect = blame_origin_incref(parent);
				1766	e->s_lno = line_blames[i - entry_len + 1].s_lno;
				1767	e->next = *ignoredp;
				1768	*ignoredp = e;
				1769	} else {
Barret Rhoden	8934ac8	2019-05-15 17:45:00 -0400	[diff] [blame]	1770	e->unblamable = 1;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1771	/* e->s_lno is already in the target's address space. */
				1772	e->next = *diffp;
				1773	*diffp = e;
				1774	}
				1775	assert(e->num_lines == entry_len);
				1776	e = next;
				1777	entry_len = 1;
				1778	}
				1779	assert(!e);
				1780	}
				1781
Barret Rhoden	55f808f	2019-05-15 17:44:58 -0400	[diff] [blame]	1782	/*
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1783	* Process one hunk from the patch between the current suspect for
				1784	* blame_entry e and its parent. This first blames any unfinished
				1785	* entries before the chunk (which is where target and parent start
				1786	* differing) on the parent, and then splits blame entries at the
				1787	* start and at the end of the difference region. Since use of -M and
				1788	* -C options may lead to overlapping/duplicate source line number
				1789	* ranges, all we can rely on from sorting/merging is the order of the
				1790	* first suspect line number.
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1791	*
				1792	* tlno: line number in the target where this chunk begins
				1793	* same: line number in the target where this chunk ends
				1794	* offset: add to tlno to get the chunk starting point in the parent
				1795	* parent_len: number of lines in the parent chunk
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1796	*/
				1797	static void blame_chunk(struct blame_entry *dstq, struct blame_entry *srcq,
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1798	int tlno, int offset, int same, int parent_len,
				1799	struct blame_origin *parent,
				1800	struct blame_origin *target, int ignore_diffs)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1801	{
				1802	struct blame_entry e = *srcq;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1803	struct blame_entry samep = NULL, diffp = NULL, *ignoredp = NULL;
				1804	struct blame_line_tracker *line_blames = NULL;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1805
				1806	while (e && e->s_lno < tlno) {
				1807	struct blame_entry *next = e->next;
				1808	/*
				1809	* current record starts before differing portion. If
				1810	* it reaches into it, we need to split it up and
				1811	* examine the second part separately.
				1812	*/
				1813	if (e->s_lno + e->num_lines > tlno) {
				1814	/* Move second half to a new record */
Barret Rhoden	55f808f	2019-05-15 17:44:58 -0400	[diff] [blame]	1815	struct blame_entry *n;
				1816
				1817	n = split_blame_at(e, tlno - e->s_lno, e->suspect);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1818	/* Push new record to diffp */
				1819	n->next = diffp;
				1820	diffp = n;
				1821	} else
				1822	blame_origin_decref(e->suspect);
				1823	/* Pass blame for everything before the differing
				1824	* chunk to the parent */
				1825	e->suspect = blame_origin_incref(parent);
				1826	e->s_lno += offset;
				1827	e->next = samep;
				1828	samep = e;
				1829	e = next;
				1830	}
				1831	/*
				1832	* As we don't know how much of a common stretch after this
				1833	* diff will occur, the currently blamed parts are all that we
				1834	* can assign to the parent for now.
				1835	*/
				1836
				1837	if (samep) {
				1838	dstq = reverse_blame(samep, dstq);
				1839	*dstq = &samep->next;
				1840	}
				1841	/*
				1842	* Prepend the split off portions: everything after e starts
				1843	* after the blameable portion.
				1844	*/
				1845	e = reverse_blame(diffp, e);
				1846
				1847	/*
				1848	* Now retain records on the target while parts are different
				1849	* from the parent.
				1850	*/
				1851	samep = NULL;
				1852	diffp = NULL;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1853
				1854	if (ignore_diffs && same - tlno > 0) {
				1855	line_blames = xcalloc(sizeof(struct blame_line_tracker),
				1856	same - tlno);
				1857	guess_line_blames(parent, target, tlno, offset, same,
				1858	parent_len, line_blames);
				1859	}
				1860
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1861	while (e && e->s_lno < same) {
				1862	struct blame_entry *next = e->next;
				1863
				1864	/*
				1865	* If current record extends into sameness, need to split.
				1866	*/
				1867	if (e->s_lno + e->num_lines > same) {
				1868	/*
				1869	* Move second half to a new record to be
				1870	* processed by later chunks
				1871	*/
Barret Rhoden	55f808f	2019-05-15 17:44:58 -0400	[diff] [blame]	1872	struct blame_entry *n;
				1873
				1874	n = split_blame_at(e, same - e->s_lno,
				1875	blame_origin_incref(e->suspect));
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1876	/* Push new record to samep */
				1877	n->next = samep;
				1878	samep = n;
				1879	}
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1880	if (ignore_diffs) {
Jeff King	07a54dc	2019-06-28 02:24:57 -0400	[diff] [blame]	1881	ignore_blame_entry(e, parent, &diffp, &ignoredp,
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1882	line_blames + e->s_lno - tlno);
				1883	} else {
				1884	e->next = diffp;
				1885	diffp = e;
				1886	}
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1887	e = next;
				1888	}
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1889	free(line_blames);
				1890	if (ignoredp) {
				1891	/*
				1892	* Note ignoredp is not sorted yet, and thus neither is dstq.
				1893	* That list must be sorted before we queue_blames(). We defer
				1894	* sorting until after all diff hunks are processed, so that
				1895	* guess_line_blames() can pick any line in the parent. The
				1896	* slight drawback is that we end up sorting all blame entries
				1897	* passed to the parent, including those that are unrelated to
				1898	* changes made by the ignored commit.
				1899	*/
				1900	dstq = reverse_blame(ignoredp, dstq);
				1901	*dstq = &ignoredp->next;
				1902	}
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1903	**srcq = reverse_blame(diffp, reverse_blame(samep, e));
				1904	/* Move across elements that are in the unblamable portion */
				1905	if (diffp)
				1906	*srcq = &diffp->next;
				1907	}
				1908
				1909	struct blame_chunk_cb_data {
				1910	struct blame_origin *parent;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1911	struct blame_origin *target;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1912	long offset;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1913	int ignore_diffs;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1914	struct blame_entry **dstq;
				1915	struct blame_entry **srcq;
				1916	};
				1917
				1918	/* diff chunks are from parent to target */
				1919	static int blame_chunk_cb(long start_a, long count_a,
				1920	long start_b, long count_b, void *data)
				1921	{
				1922	struct blame_chunk_cb_data *d = data;
				1923	if (start_a - start_b != d->offset)
				1924	die("internal error in blame::blame_chunk_cb");
				1925	blame_chunk(&d->dstq, &d->srcq, start_b, start_a - start_b,
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1926	start_b + count_b, count_a, d->parent, d->target,
				1927	d->ignore_diffs);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1928	d->offset = start_a + count_a - (start_b + count_b);
				1929	return 0;
				1930	}
				1931
				1932	/*
				1933	* We are looking at the origin 'target' and aiming to pass blame
				1934	* for the lines it is suspected to its parent. Run diff to find
				1935	* which lines came from parent and pass blame for them.
				1936	*/
				1937	static void pass_blame_to_parent(struct blame_scoreboard *sb,
				1938	struct blame_origin *target,
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1939	struct blame_origin *parent, int ignore_diffs)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1940	{
				1941	mmfile_t file_p, file_o;
				1942	struct blame_chunk_cb_data d;
				1943	struct blame_entry *newdest = NULL;
				1944
				1945	if (!target->suspects)
				1946	return; /* nothing remains for this target */
				1947
				1948	d.parent = parent;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1949	d.target = target;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1950	d.offset = 0;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1951	d.ignore_diffs = ignore_diffs;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1952	d.dstq = &newdest; d.srcq = &target->suspects;
				1953
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	1954	fill_origin_blob(&sb->revs->diffopt, parent, &file_p,
				1955	&sb->num_read_blob, ignore_diffs);
				1956	fill_origin_blob(&sb->revs->diffopt, target, &file_o,
				1957	&sb->num_read_blob, ignore_diffs);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1958	sb->num_get_patch++;
				1959
				1960	if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts))
				1961	die("unable to generate diff (%s -> %s)",
				1962	oid_to_hex(&parent->commit->object.oid),
				1963	oid_to_hex(&target->commit->object.oid));
				1964	/* The rest are the same as the parent */
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1965	blame_chunk(&d.dstq, &d.srcq, INT_MAX, d.offset, INT_MAX, 0,
				1966	parent, target, 0);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1967	*d.dstq = NULL;
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	1968	if (ignore_diffs)
				1969	newdest = llist_mergesort(newdest, get_next_blame,
				1970	set_next_blame,
				1971	compare_blame_suspect);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	1972	queue_blames(sb, parent, newdest);
				1973
				1974	return;
				1975	}
				1976
				1977	/*
				1978	* The lines in blame_entry after splitting blames many times can become
				1979	* very small and trivial, and at some point it becomes pointless to
				1980	* blame the parents. E.g. "\t\t}\n\t}\n\n" appears everywhere in any
				1981	* ordinary C program, and it is not worth to say it was copied from
				1982	* totally unrelated file in the parent.
				1983	*
				1984	* Compute how trivial the lines in the blame_entry are.
				1985	*/
				1986	unsigned blame_entry_score(struct blame_scoreboard sb, struct blame_entry e)
				1987	{
				1988	unsigned score;
				1989	const char cp, ep;
				1990
				1991	if (e->score)
				1992	return e->score;
				1993
				1994	score = 1;
				1995	cp = blame_nth_line(sb, e->lno);
				1996	ep = blame_nth_line(sb, e->lno + e->num_lines);
				1997	while (cp < ep) {
				1998	unsigned ch = ((unsigned char )cp);
				1999	if (isalnum(ch))
				2000	score++;
				2001	cp++;
				2002	}
				2003	e->score = score;
				2004	return score;
				2005	}
				2006
				2007	/*
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2008	* best_so_far[] and potential[] are both a split of an existing blame_entry
				2009	* that passes blame to the parent. Maintain best_so_far the best split so
				2010	* far, by comparing potential and best_so_far and copying potential into
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2011	* bst_so_far as needed.
				2012	*/
				2013	static void copy_split_if_better(struct blame_scoreboard *sb,
				2014	struct blame_entry *best_so_far,
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2015	struct blame_entry *potential)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2016	{
				2017	int i;
				2018
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2019	if (!potential[1].suspect)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2020	return;
				2021	if (best_so_far[1].suspect) {
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2022	if (blame_entry_score(sb, &potential[1]) <
				2023	blame_entry_score(sb, &best_so_far[1]))
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2024	return;
				2025	}
				2026
				2027	for (i = 0; i < 3; i++)
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2028	blame_origin_incref(potential[i].suspect);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2029	decref_split(best_so_far);
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2030	memcpy(best_so_far, potential, sizeof(struct blame_entry[3]));
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2031	}
				2032
				2033	/*
				2034	* We are looking at a part of the final image represented by
				2035	* ent (tlno and same are offset by ent->s_lno).
				2036	* tlno is where we are looking at in the final image.
				2037	* up to (but not including) same match preimage.
				2038	* plno is where we are looking at in the preimage.
				2039	*
				2040	* <-------------- final image ---------------------->
				2041	* <------ent------>
				2042	* ^tlno ^same
				2043	* <---------preimage----->
				2044	* ^plno
				2045	*
				2046	* All line numbers are 0-based.
				2047	*/
				2048	static void handle_split(struct blame_scoreboard *sb,
				2049	struct blame_entry *ent,
				2050	int tlno, int plno, int same,
				2051	struct blame_origin *parent,
				2052	struct blame_entry *split)
				2053	{
				2054	if (ent->num_lines <= tlno)
				2055	return;
				2056	if (tlno < same) {
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2057	struct blame_entry potential[3];
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2058	tlno += ent->s_lno;
				2059	same += ent->s_lno;
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2060	split_overlap(potential, ent, tlno, plno, same, parent);
				2061	copy_split_if_better(sb, split, potential);
				2062	decref_split(potential);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2063	}
				2064	}
				2065
				2066	struct handle_split_cb_data {
				2067	struct blame_scoreboard *sb;
				2068	struct blame_entry *ent;
				2069	struct blame_origin *parent;
				2070	struct blame_entry *split;
				2071	long plno;
				2072	long tlno;
				2073	};
				2074
				2075	static int handle_split_cb(long start_a, long count_a,
				2076	long start_b, long count_b, void *data)
				2077	{
				2078	struct handle_split_cb_data *d = data;
				2079	handle_split(d->sb, d->ent, d->tlno, d->plno, start_b, d->parent,
				2080	d->split);
				2081	d->plno = start_a + count_a;
				2082	d->tlno = start_b + count_b;
				2083	return 0;
				2084	}
				2085
				2086	/*
				2087	* Find the lines from parent that are the same as ent so that
				2088	* we can pass blames to it. file_p has the blob contents for
				2089	* the parent.
				2090	*/
				2091	static void find_copy_in_blob(struct blame_scoreboard *sb,
				2092	struct blame_entry *ent,
				2093	struct blame_origin *parent,
				2094	struct blame_entry *split,
				2095	mmfile_t *file_p)
				2096	{
				2097	const char *cp;
				2098	mmfile_t file_o;
				2099	struct handle_split_cb_data d;
				2100
				2101	memset(&d, 0, sizeof(d));
				2102	d.sb = sb; d.ent = ent; d.parent = parent; d.split = split;
				2103	/*
				2104	* Prepare mmfile that contains only the lines in ent.
				2105	*/
				2106	cp = blame_nth_line(sb, ent->lno);
				2107	file_o.ptr = (char *) cp;
				2108	file_o.size = blame_nth_line(sb, ent->lno + ent->num_lines) - cp;
				2109
				2110	/*
				2111	* file_o is a part of final image we are annotating.
				2112	* file_p partially may match that image.
				2113	*/
				2114	memset(split, 0, sizeof(struct blame_entry [3]));
				2115	if (diff_hunks(file_p, &file_o, handle_split_cb, &d, sb->xdl_opts))
				2116	die("unable to generate diff (%s)",
				2117	oid_to_hex(&parent->commit->object.oid));
				2118	/* remainder, if any, all match the preimage */
				2119	handle_split(sb, ent, d.tlno, d.plno, ent->num_lines, parent, split);
				2120	}
				2121
				2122	/* Move all blame entries from list *source that have a score smaller
				2123	* than score_min to the front of list *small.
				2124	* Returns a pointer to the link pointing to the old head of the small list.
				2125	*/
				2126
				2127	static struct blame_entry *filter_small(struct blame_scoreboard sb,
				2128	struct blame_entry **small,
				2129	struct blame_entry **source,
				2130	unsigned score_min)
				2131	{
				2132	struct blame_entry p = source;
				2133	struct blame_entry oldsmall = small;
				2134	while (p) {
				2135	if (blame_entry_score(sb, p) <= score_min) {
				2136	*small = p;
				2137	small = &p->next;
				2138	p = *small;
				2139	} else {
				2140	*source = p;
				2141	source = &p->next;
				2142	p = *source;
				2143	}
				2144	}
				2145	*small = oldsmall;
				2146	*source = NULL;
				2147	return small;
				2148	}
				2149
				2150	/*
				2151	* See if lines currently target is suspected for can be attributed to
				2152	* parent.
				2153	*/
				2154	static void find_move_in_parent(struct blame_scoreboard *sb,
				2155	struct blame_entry ***blamed,
				2156	struct blame_entry **toosmall,
				2157	struct blame_origin *target,
				2158	struct blame_origin *parent)
				2159	{
				2160	struct blame_entry *e, split[3];
				2161	struct blame_entry *unblamed = target->suspects;
				2162	struct blame_entry *leftover = NULL;
				2163	mmfile_t file_p;
				2164
				2165	if (!unblamed)
				2166	return; /* nothing remains for this target */
				2167
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	2168	fill_origin_blob(&sb->revs->diffopt, parent, &file_p,
				2169	&sb->num_read_blob, 0);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2170	if (!file_p.ptr)
				2171	return;
				2172
				2173	/* At each iteration, unblamed has a NULL-terminated list of
				2174	* entries that have not yet been tested for blame. leftover
				2175	* contains the reversed list of entries that have been tested
				2176	* without being assignable to the parent.
				2177	*/
				2178	do {
				2179	struct blame_entry **unblamedtail = &unblamed;
				2180	struct blame_entry *next;
				2181	for (e = unblamed; e; e = next) {
				2182	next = e->next;
				2183	find_copy_in_blob(sb, e, parent, split, &file_p);
				2184	if (split[1].suspect &&
				2185	sb->move_score < blame_entry_score(sb, &split[1])) {
				2186	split_blame(blamed, &unblamedtail, split, e);
				2187	} else {
				2188	e->next = leftover;
				2189	leftover = e;
				2190	}
				2191	decref_split(split);
				2192	}
				2193	*unblamedtail = NULL;
				2194	toosmall = filter_small(sb, toosmall, &unblamed, sb->move_score);
				2195	} while (unblamed);
				2196	target->suspects = reverse_blame(leftover, NULL);
				2197	}
				2198
				2199	struct blame_list {
				2200	struct blame_entry *ent;
				2201	struct blame_entry split[3];
				2202	};
				2203
				2204	/*
				2205	* Count the number of entries the target is suspected for,
				2206	* and prepare a list of entry and the best split.
				2207	*/
				2208	static struct blame_list setup_blame_list(struct blame_entry unblamed,
				2209	int *num_ents_p)
				2210	{
				2211	struct blame_entry *e;
				2212	int num_ents, i;
				2213	struct blame_list *blame_list = NULL;
				2214
				2215	for (e = unblamed, num_ents = 0; e; e = e->next)
				2216	num_ents++;
				2217	if (num_ents) {
				2218	blame_list = xcalloc(num_ents, sizeof(struct blame_list));
				2219	for (e = unblamed, i = 0; e; e = e->next)
				2220	blame_list[i++].ent = e;
				2221	}
				2222	*num_ents_p = num_ents;
				2223	return blame_list;
				2224	}
				2225
				2226	/*
				2227	* For lines target is suspected for, see if we can find code movement
				2228	* across file boundary from the parent commit. porigin is the path
				2229	* in the parent we already tried.
				2230	*/
				2231	static void find_copy_in_parent(struct blame_scoreboard *sb,
				2232	struct blame_entry ***blamed,
				2233	struct blame_entry **toosmall,
				2234	struct blame_origin *target,
				2235	struct commit *parent,
				2236	struct blame_origin *porigin,
				2237	int opt)
				2238	{
				2239	struct diff_options diff_opts;
				2240	int i, j;
				2241	struct blame_list *blame_list;
				2242	int num_ents;
				2243	struct blame_entry *unblamed = target->suspects;
				2244	struct blame_entry *leftover = NULL;
				2245
				2246	if (!unblamed)
				2247	return; /* nothing remains for this target */
				2248
Nguyễn Thái Ngọc Duy	e675765	2018-09-21 17:57:24 +0200	[diff] [blame]	2249	repo_diff_setup(sb->repo, &diff_opts);
Brandon Williams	0d1e0e7	2017-10-31 11:19:11 -0700	[diff] [blame]	2250	diff_opts.flags.recursive = 1;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2251	diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
				2252
				2253	diff_setup_done(&diff_opts);
				2254
				2255	/* Try "find copies harder" on new path if requested;
				2256	* we do not want to use diffcore_rename() actually to
				2257	* match things up; find_copies_harder is set only to
Junio C Hamano	a6f38c1	2017-06-19 12:38:44 -0700	[diff] [blame]	2258	* force diff_tree_oid() to feed all filepairs to diff_queue,
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2259	* and this code needs to be after diff_setup_done(), which
				2260	* usually makes find-copies-harder imply copy detection.
				2261	*/
				2262	if ((opt & PICKAXE_BLAME_COPY_HARDEST)
				2263	\|\| ((opt & PICKAXE_BLAME_COPY_HARDER)
				2264	&& (!porigin \|\| strcmp(target->path, porigin->path))))
Brandon Williams	0d1e0e7	2017-10-31 11:19:11 -0700	[diff] [blame]	2265	diff_opts.flags.find_copies_harder = 1;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2266
				2267	if (is_null_oid(&target->commit->object.oid))
Derrick Stolee	2e27bd7	2018-04-06 19:09:38 +0000	[diff] [blame]	2268	do_diff_cache(get_commit_tree_oid(parent), &diff_opts);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2269	else
Derrick Stolee	2e27bd7	2018-04-06 19:09:38 +0000	[diff] [blame]	2270	diff_tree_oid(get_commit_tree_oid(parent),
				2271	get_commit_tree_oid(target->commit),
Junio C Hamano	a6f38c1	2017-06-19 12:38:44 -0700	[diff] [blame]	2272	"", &diff_opts);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2273
Brandon Williams	0d1e0e7	2017-10-31 11:19:11 -0700	[diff] [blame]	2274	if (!diff_opts.flags.find_copies_harder)
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2275	diffcore_std(&diff_opts);
				2276
				2277	do {
				2278	struct blame_entry **unblamedtail = &unblamed;
				2279	blame_list = setup_blame_list(unblamed, &num_ents);
				2280
				2281	for (i = 0; i < diff_queued_diff.nr; i++) {
				2282	struct diff_filepair *p = diff_queued_diff.queue[i];
				2283	struct blame_origin *norigin;
				2284	mmfile_t file_p;
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2285	struct blame_entry potential[3];
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2286
				2287	if (!DIFF_FILE_VALID(p->one))
				2288	continue; /* does not exist in parent */
				2289	if (S_ISGITLINK(p->one->mode))
				2290	continue; /* ignore git links */
				2291	if (porigin && !strcmp(p->one->path, porigin->path))
				2292	/* find_move already dealt with this path */
				2293	continue;
				2294
				2295	norigin = get_origin(parent, p->one->path);
				2296	oidcpy(&norigin->blob_oid, &p->one->oid);
				2297	norigin->mode = p->one->mode;
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	2298	fill_origin_blob(&sb->revs->diffopt, norigin, &file_p,
				2299	&sb->num_read_blob, 0);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2300	if (!file_p.ptr)
				2301	continue;
				2302
				2303	for (j = 0; j < num_ents; j++) {
				2304	find_copy_in_blob(sb, blame_list[j].ent,
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2305	norigin, potential, &file_p);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2306	copy_split_if_better(sb, blame_list[j].split,
Brandon Williams	abeacb2	2018-02-14 10:59:25 -0800	[diff] [blame]	2307	potential);
				2308	decref_split(potential);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2309	}
				2310	blame_origin_decref(norigin);
				2311	}
				2312
				2313	for (j = 0; j < num_ents; j++) {
				2314	struct blame_entry *split = blame_list[j].split;
				2315	if (split[1].suspect &&
				2316	sb->copy_score < blame_entry_score(sb, &split[1])) {
				2317	split_blame(blamed, &unblamedtail, split,
				2318	blame_list[j].ent);
				2319	} else {
				2320	blame_list[j].ent->next = leftover;
				2321	leftover = blame_list[j].ent;
				2322	}
				2323	decref_split(split);
				2324	}
				2325	free(blame_list);
				2326	*unblamedtail = NULL;
				2327	toosmall = filter_small(sb, toosmall, &unblamed, sb->copy_score);
				2328	} while (unblamed);
				2329	target->suspects = reverse_blame(leftover, NULL);
				2330	diff_flush(&diff_opts);
				2331	clear_pathspec(&diff_opts.pathspec);
				2332	}
				2333
				2334	/*
				2335	* The blobs of origin and porigin exactly match, so everything
				2336	* origin is suspected for can be blamed on the parent.
				2337	*/
				2338	static void pass_whole_blame(struct blame_scoreboard *sb,
				2339	struct blame_origin origin, struct blame_origin porigin)
				2340	{
				2341	struct blame_entry e, suspects;
				2342
				2343	if (!porigin->file.ptr && origin->file.ptr) {
				2344	/* Steal its file */
				2345	porigin->file = origin->file;
				2346	origin->file.ptr = NULL;
				2347	}
				2348	suspects = origin->suspects;
				2349	origin->suspects = NULL;
				2350	for (e = suspects; e; e = e->next) {
				2351	blame_origin_incref(porigin);
				2352	blame_origin_decref(e->suspect);
				2353	e->suspect = porigin;
				2354	}
				2355	queue_blames(sb, porigin, suspects);
				2356	}
				2357
				2358	/*
				2359	* We pass blame from the current commit to its parents. We keep saying
				2360	* "parent" (and "porigin"), but what we mean is to find scapegoat to
				2361	* exonerate ourselves.
				2362	*/
				2363	static struct commit_list first_scapegoat(struct rev_info revs, struct commit *commit,
				2364	int reverse)
				2365	{
				2366	if (!reverse) {
				2367	if (revs->first_parent_only &&
				2368	commit->parents &&
				2369	commit->parents->next) {
				2370	free_commit_list(commit->parents->next);
				2371	commit->parents->next = NULL;
				2372	}
				2373	return commit->parents;
				2374	}
				2375	return lookup_decoration(&revs->children, &commit->object);
				2376	}
				2377
				2378	static int num_scapegoats(struct rev_info revs, struct commit commit, int reverse)
				2379	{
				2380	struct commit_list *l = first_scapegoat(revs, commit, reverse);
				2381	return commit_list_count(l);
				2382	}
				2383
				2384	/* Distribute collected unsorted blames to the respected sorted lists
				2385	* in the various origins.
				2386	*/
				2387	static void distribute_blame(struct blame_scoreboard sb, struct blame_entry blamed)
				2388	{
				2389	blamed = llist_mergesort(blamed, get_next_blame, set_next_blame,
				2390	compare_blame_suspect);
				2391	while (blamed)
				2392	{
				2393	struct blame_origin *porigin = blamed->suspect;
				2394	struct blame_entry *suspects = NULL;
				2395	do {
				2396	struct blame_entry *next = blamed->next;
				2397	blamed->next = suspects;
				2398	suspects = blamed;
				2399	blamed = next;
				2400	} while (blamed && blamed->suspect == porigin);
				2401	suspects = reverse_blame(suspects, NULL);
				2402	queue_blames(sb, porigin, suspects);
				2403	}
				2404	}
				2405
				2406	#define MAXSG 16
				2407
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	2408	typedef struct blame_origin (blame_find_alg)(struct repository *,
				2409	struct commit *,
				2410	struct blame_origin *,
				2411	struct blame_bloom_data *);
				2412
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2413	static void pass_blame(struct blame_scoreboard sb, struct blame_origin origin, int opt)
				2414	{
				2415	struct rev_info *revs = sb->revs;
				2416	int i, pass, num_sg;
				2417	struct commit *commit = origin->commit;
				2418	struct commit_list *sg;
				2419	struct blame_origin *sg_buf[MAXSG];
				2420	struct blame_origin porigin, *sg_origin = sg_buf;
				2421	struct blame_entry *toosmall = NULL;
				2422	struct blame_entry blames, *blametail = &blames;
				2423
				2424	num_sg = num_scapegoats(revs, commit, sb->reverse);
				2425	if (!num_sg)
				2426	goto finish;
				2427	else if (num_sg < ARRAY_SIZE(sg_buf))
				2428	memset(sg_buf, 0, sizeof(sg_buf));
				2429	else
				2430	sg_origin = xcalloc(num_sg, sizeof(*sg_origin));
				2431
				2432	/*
				2433	* The first pass looks for unrenamed path to optimize for
				2434	* common cases, then we look for renames in the second pass.
				2435	*/
				2436	for (pass = 0; pass < 2 - sb->no_whole_file_rename; pass++) {
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	2437	blame_find_alg find = pass ? find_rename : find_origin;
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2438
				2439	for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
				2440	i < num_sg && sg;
				2441	sg = sg->next, i++) {
				2442	struct commit *p = sg->item;
				2443	int j, same;
				2444
				2445	if (sg_origin[i])
				2446	continue;
				2447	if (parse_commit(p))
				2448	continue;
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	2449	porigin = find(sb->repo, p, origin, sb->bloom_data);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2450	if (!porigin)
				2451	continue;
Jeff King	4a7e27e	2018-08-28 17:22:40 -0400	[diff] [blame]	2452	if (oideq(&porigin->blob_oid, &origin->blob_oid)) {
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2453	pass_whole_blame(sb, origin, porigin);
				2454	blame_origin_decref(porigin);
				2455	goto finish;
				2456	}
				2457	for (j = same = 0; j < i; j++)
				2458	if (sg_origin[j] &&
Jeff King	4a7e27e	2018-08-28 17:22:40 -0400	[diff] [blame]	2459	oideq(&sg_origin[j]->blob_oid, &porigin->blob_oid)) {
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2460	same = 1;
				2461	break;
				2462	}
				2463	if (!same)
				2464	sg_origin[i] = porigin;
				2465	else
				2466	blame_origin_decref(porigin);
				2467	}
				2468	}
				2469
				2470	sb->num_commits++;
				2471	for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
				2472	i < num_sg && sg;
				2473	sg = sg->next, i++) {
				2474	struct blame_origin *porigin = sg_origin[i];
				2475	if (!porigin)
				2476	continue;
				2477	if (!origin->previous) {
				2478	blame_origin_incref(porigin);
				2479	origin->previous = porigin;
				2480	}
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	2481	pass_blame_to_parent(sb, origin, porigin, 0);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2482	if (!origin->suspects)
				2483	goto finish;
				2484	}
				2485
				2486	/*
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	2487	* Pass remaining suspects for ignored commits to their parents.
				2488	*/
				2489	if (oidset_contains(&sb->ignore_list, &commit->object.oid)) {
				2490	for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
				2491	i < num_sg && sg;
				2492	sg = sg->next, i++) {
				2493	struct blame_origin *porigin = sg_origin[i];
				2494
				2495	if (!porigin)
				2496	continue;
				2497	pass_blame_to_parent(sb, origin, porigin, 1);
Barret Rhoden	a07a977	2019-06-20 12:38:19 -0400	[diff] [blame]	2498	/*
				2499	* Preemptively drop porigin so we can refresh the
				2500	* fingerprints if we use the parent again, which can
				2501	* occur if you ignore back-to-back commits.
				2502	*/
				2503	drop_origin_blob(porigin);
Barret Rhoden	ae3f36d	2019-05-15 17:44:59 -0400	[diff] [blame]	2504	if (!origin->suspects)
				2505	goto finish;
				2506	}
				2507	}
				2508
				2509	/*
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2510	* Optionally find moves in parents' files.
				2511	*/
				2512	if (opt & PICKAXE_BLAME_MOVE) {
				2513	filter_small(sb, &toosmall, &origin->suspects, sb->move_score);
				2514	if (origin->suspects) {
				2515	for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
				2516	i < num_sg && sg;
				2517	sg = sg->next, i++) {
				2518	struct blame_origin *porigin = sg_origin[i];
				2519	if (!porigin)
				2520	continue;
				2521	find_move_in_parent(sb, &blametail, &toosmall, origin, porigin);
				2522	if (!origin->suspects)
				2523	break;
				2524	}
				2525	}
				2526	}
				2527
				2528	/*
				2529	* Optionally find copies from parents' files.
				2530	*/
				2531	if (opt & PICKAXE_BLAME_COPY) {
				2532	if (sb->copy_score > sb->move_score)
				2533	filter_small(sb, &toosmall, &origin->suspects, sb->copy_score);
				2534	else if (sb->copy_score < sb->move_score) {
				2535	origin->suspects = blame_merge(origin->suspects, toosmall);
				2536	toosmall = NULL;
				2537	filter_small(sb, &toosmall, &origin->suspects, sb->copy_score);
				2538	}
				2539	if (!origin->suspects)
				2540	goto finish;
				2541
				2542	for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
				2543	i < num_sg && sg;
				2544	sg = sg->next, i++) {
				2545	struct blame_origin *porigin = sg_origin[i];
				2546	find_copy_in_parent(sb, &blametail, &toosmall,
				2547	origin, sg->item, porigin, opt);
				2548	if (!origin->suspects)
				2549	goto finish;
				2550	}
				2551	}
				2552
				2553	finish:
				2554	*blametail = NULL;
				2555	distribute_blame(sb, blames);
				2556	/*
				2557	* prepend toosmall to origin->suspects
				2558	*
				2559	* There is no point in sorting: this ends up on a big
				2560	* unsorted list in the caller anyway.
				2561	*/
				2562	if (toosmall) {
				2563	struct blame_entry **tail = &toosmall;
				2564	while (*tail)
				2565	tail = &(*tail)->next;
				2566	*tail = origin->suspects;
				2567	origin->suspects = toosmall;
				2568	}
				2569	for (i = 0; i < num_sg; i++) {
				2570	if (sg_origin[i]) {
David Kastrup	f892014	2019-04-02 13:56:25 +0200	[diff] [blame]	2571	if (!sg_origin[i]->suspects)
				2572	drop_origin_blob(sg_origin[i]);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2573	blame_origin_decref(sg_origin[i]);
				2574	}
				2575	}
				2576	drop_origin_blob(origin);
				2577	if (sg_buf != sg_origin)
				2578	free(sg_origin);
				2579	}
				2580
				2581	/*
				2582	* The main loop -- while we have blobs with lines whose true origin
				2583	* is still unknown, pick one blob, and allow its lines to pass blames
				2584	* to its parents. */
				2585	void assign_blame(struct blame_scoreboard *sb, int opt)
				2586	{
				2587	struct rev_info *revs = sb->revs;
				2588	struct commit *commit = prio_queue_get(&sb->commits);
				2589
				2590	while (commit) {
				2591	struct blame_entry *ent;
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	2592	struct blame_origin *suspect = get_blame_suspects(commit);
Jeff Smith	b543bb1	2017-05-24 00:15:35 -0500	[diff] [blame]	2593
				2594	/* find one suspect to break down */
				2595	while (suspect && !suspect->suspects)
				2596	suspect = suspect->next;
				2597
				2598	if (!suspect) {
				2599	commit = prio_queue_get(&sb->commits);
				2600	continue;
				2601	}
				2602
				2603	assert(commit == suspect->commit);
				2604
				2605	/*
				2606	* We will use this suspect later in the loop,
				2607	* so hold onto it in the meantime.
				2608	*/
				2609	blame_origin_incref(suspect);
				2610	parse_commit(commit);
				2611	if (sb->reverse \|\|
				2612	(!(commit->object.flags & UNINTERESTING) &&
				2613	!(revs->max_age != -1 && commit->date < revs->max_age)))
				2614	pass_blame(sb, suspect, opt);
				2615	else {
				2616	commit->object.flags \|= UNINTERESTING;
				2617	if (commit->object.parsed)
				2618	mark_parents_uninteresting(commit);
				2619	}
				2620	/* treat root commit as boundary */
				2621	if (!commit->parents && !sb->show_root)
				2622	commit->object.flags \|= UNINTERESTING;
				2623
				2624	/* Take responsibility for the remaining entries */
				2625	ent = suspect->suspects;
				2626	if (ent) {
				2627	suspect->guilty = 1;
				2628	for (;;) {
				2629	struct blame_entry *next = ent->next;
				2630	if (sb->found_guilty_entry)
				2631	sb->found_guilty_entry(ent, sb->found_guilty_entry_data);
				2632	if (next) {
				2633	ent = next;
				2634	continue;
				2635	}
				2636	ent->next = sb->ent;
				2637	sb->ent = suspect->suspects;
				2638	suspect->suspects = NULL;
				2639	break;
				2640	}
				2641	}
				2642	blame_origin_decref(suspect);
				2643
				2644	if (sb->debug) /* sanity */
				2645	sanity_check_refcnt(sb);
				2646	}
				2647	}
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2648
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2649	/*
				2650	* To allow quick access to the contents of nth line in the
				2651	* final image, prepare an index in the scoreboard.
				2652	*/
				2653	static int prepare_lines(struct blame_scoreboard *sb)
				2654	{
Barret Rhoden	1fc7338	2019-05-15 17:45:01 -0400	[diff] [blame]	2655	sb->num_lines = find_line_starts(&sb->lineno, sb->final_buf,
				2656	sb->final_buf_size);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2657	return sb->num_lines;
				2658	}
				2659
				2660	static struct commit find_single_final(struct rev_info revs,
				2661	const char **name_p)
				2662	{
				2663	int i;
				2664	struct commit *found = NULL;
				2665	const char *name = NULL;
				2666
				2667	for (i = 0; i < revs->pending.nr; i++) {
				2668	struct object *obj = revs->pending.objects[i].item;
				2669	if (obj->flags & UNINTERESTING)
				2670	continue;
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	2671	obj = deref_tag(revs->repo, obj, NULL, 0);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2672	if (obj->type != OBJ_COMMIT)
				2673	die("Non commit %s?", revs->pending.objects[i].name);
				2674	if (found)
				2675	die("More than one commit to dig from %s and %s?",
				2676	revs->pending.objects[i].name, name);
				2677	found = (struct commit *)obj;
				2678	name = revs->pending.objects[i].name;
				2679	}
				2680	if (name_p)
SZEDER Gábor	9e7d8a9	2017-07-24 23:15:50 +0200	[diff] [blame]	2681	*name_p = xstrdup_or_null(name);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2682	return found;
				2683	}
				2684
				2685	static struct commit dwim_reverse_initial(struct rev_info revs,
				2686	const char **name_p)
				2687	{
				2688	/*
				2689	* DWIM "git blame --reverse ONE -- PATH" as
				2690	* "git blame --reverse ONE..HEAD -- PATH" but only do so
				2691	* when it makes sense.
				2692	*/
				2693	struct object *obj;
				2694	struct commit *head_commit;
Junio C Hamano	583c6a2	2017-06-05 09:18:11 +0900	[diff] [blame]	2695	struct object_id head_oid;
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2696
				2697	if (revs->pending.nr != 1)
				2698	return NULL;
				2699
				2700	/* Is that sole rev a committish? */
				2701	obj = revs->pending.objects[0].item;
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	2702	obj = deref_tag(revs->repo, obj, NULL, 0);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2703	if (obj->type != OBJ_COMMIT)
				2704	return NULL;
				2705
				2706	/* Do we have HEAD? */
brian m. carlson	49e6147	2017-10-15 22:07:09 +0000	[diff] [blame]	2707	if (!resolve_ref_unsafe("HEAD", RESOLVE_REF_READING, &head_oid, NULL))
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2708	return NULL;
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	2709	head_commit = lookup_commit_reference_gently(revs->repo,
Stefan Beller	21e1ee8	2018-06-28 18:21:57 -0700	[diff] [blame]	2710	&head_oid, 1);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2711	if (!head_commit)
				2712	return NULL;
				2713
				2714	/* Turn "ONE" into "ONE..HEAD" then */
				2715	obj->flags \|= UNINTERESTING;
				2716	add_pending_object(revs, &head_commit->object, "HEAD");
				2717
				2718	if (name_p)
				2719	*name_p = revs->pending.objects[0].name;
				2720	return (struct commit *)obj;
				2721	}
				2722
				2723	static struct commit find_single_initial(struct rev_info revs,
				2724	const char **name_p)
				2725	{
				2726	int i;
				2727	struct commit *found = NULL;
				2728	const char *name = NULL;
				2729
				2730	/*
				2731	* There must be one and only one negative commit, and it must be
				2732	* the boundary.
				2733	*/
				2734	for (i = 0; i < revs->pending.nr; i++) {
				2735	struct object *obj = revs->pending.objects[i].item;
				2736	if (!(obj->flags & UNINTERESTING))
				2737	continue;
Nguyễn Thái Ngọc Duy	fb998ea	2018-11-10 06:48:58 +0100	[diff] [blame]	2738	obj = deref_tag(revs->repo, obj, NULL, 0);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2739	if (obj->type != OBJ_COMMIT)
				2740	die("Non commit %s?", revs->pending.objects[i].name);
				2741	if (found)
				2742	die("More than one commit to dig up from, %s and %s?",
				2743	revs->pending.objects[i].name, name);
				2744	found = (struct commit *) obj;
				2745	name = revs->pending.objects[i].name;
				2746	}
				2747
				2748	if (!name)
				2749	found = dwim_reverse_initial(revs, &name);
				2750	if (!name)
				2751	die("No commit to dig up from?");
				2752
				2753	if (name_p)
SZEDER Gábor	9e7d8a9	2017-07-24 23:15:50 +0200	[diff] [blame]	2754	*name_p = xstrdup(name);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2755	return found;
				2756	}
				2757
				2758	void init_scoreboard(struct blame_scoreboard *sb)
				2759	{
				2760	memset(sb, 0, sizeof(struct blame_scoreboard));
				2761	sb->move_score = BLAME_DEFAULT_MOVE_SCORE;
				2762	sb->copy_score = BLAME_DEFAULT_COPY_SCORE;
				2763	}
				2764
Nguyễn Thái Ngọc Duy	ecbbc0a	2018-08-13 18:14:41 +0200	[diff] [blame]	2765	void setup_scoreboard(struct blame_scoreboard *sb,
				2766	const char *path,
				2767	struct blame_origin **orig)
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2768	{
				2769	const char *final_commit_name = NULL;
				2770	struct blame_origin *o;
				2771	struct commit *final_commit = NULL;
				2772	enum object_type type;
				2773
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	2774	init_blame_suspects(&blame_suspects);
				2775
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2776	if (sb->reverse && sb->contents_from)
				2777	die(_("--contents and --reverse do not blend well."));
				2778
Nguyễn Thái Ngọc Duy	ecbbc0a	2018-08-13 18:14:41 +0200	[diff] [blame]	2779	if (!sb->repo)
				2780	BUG("repo is NULL");
				2781
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2782	if (!sb->reverse) {
				2783	sb->final = find_single_final(sb->revs, &final_commit_name);
				2784	sb->commits.compare = compare_commits_by_commit_date;
				2785	} else {
				2786	sb->final = find_single_initial(sb->revs, &final_commit_name);
				2787	sb->commits.compare = compare_commits_by_reverse_commit_date;
				2788	}
				2789
				2790	if (sb->final && sb->contents_from)
				2791	die(_("cannot use --contents with final commit object name"));
				2792
				2793	if (sb->reverse && sb->revs->first_parent_only)
				2794	sb->revs->children.name = NULL;
				2795
				2796	if (!sb->final) {
				2797	/*
				2798	* "--not A B -- path" without anything positive;
				2799	* do not default to HEAD, but use the working tree
				2800	* or "--contents".
				2801	*/
				2802	setup_work_tree();
Nguyễn Thái Ngọc Duy	ecbbc0a	2018-08-13 18:14:41 +0200	[diff] [blame]	2803	sb->final = fake_working_tree_commit(sb->repo,
				2804	&sb->revs->diffopt,
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2805	path, sb->contents_from);
				2806	add_pending_object(sb->revs, &(sb->final->object), ":");
				2807	}
				2808
				2809	if (sb->reverse && sb->revs->first_parent_only) {
				2810	final_commit = find_single_final(sb->revs, NULL);
				2811	if (!final_commit)
				2812	die(_("--reverse and --first-parent together require specified latest commit"));
				2813	}
				2814
				2815	/*
				2816	* If we have bottom, this will mark the ancestors of the
				2817	* bottom commits we would reach while traversing as
				2818	* uninteresting.
				2819	*/
				2820	if (prepare_revision_walk(sb->revs))
				2821	die(_("revision walk setup failed"));
				2822
				2823	if (sb->reverse && sb->revs->first_parent_only) {
				2824	struct commit *c = final_commit;
				2825
				2826	sb->revs->children.name = "children";
				2827	while (c->parents &&
Jeff King	9001dc2	2018-08-28 17:22:48 -0400	[diff] [blame]	2828	!oideq(&c->object.oid, &sb->final->object.oid)) {
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2829	struct commit_list l = xcalloc(1, sizeof(l));
				2830
				2831	l->item = c;
				2832	if (add_decoration(&sb->revs->children,
				2833	&c->parents->item->object, l))
Johannes Schindelin	033abf9	2018-05-02 11:38:39 +0200	[diff] [blame]	2834	BUG("not unique item in first-parent chain");
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2835	c = c->parents->item;
				2836	}
				2837
Jeff King	9001dc2	2018-08-28 17:22:48 -0400	[diff] [blame]	2838	if (!oideq(&c->object.oid, &sb->final->object.oid))
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2839	die(_("--reverse --first-parent together require range along first-parent chain"));
				2840	}
				2841
				2842	if (is_null_oid(&sb->final->object.oid)) {
Nguyễn Thái Ngọc Duy	4e0df4e	2018-05-19 07:28:19 +0200	[diff] [blame]	2843	o = get_blame_suspects(sb->final);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2844	sb->final_buf = xmemdupz(o->file.ptr, o->file.size);
				2845	sb->final_buf_size = o->file.size;
				2846	}
				2847	else {
				2848	o = get_origin(sb->final, path);
Nguyễn Thái Ngọc Duy	ecbbc0a	2018-08-13 18:14:41 +0200	[diff] [blame]	2849	if (fill_blob_sha1_and_mode(sb->repo, o))
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2850	die(_("no such path %s in %s"), path, final_commit_name);
				2851
Brandon Williams	0d1e0e7	2017-10-31 11:19:11 -0700	[diff] [blame]	2852	if (sb->revs->diffopt.flags.allow_textconv &&
Nguyễn Thái Ngọc Duy	6afaf80	2018-09-21 17:57:22 +0200	[diff] [blame]	2853	textconv_object(sb->repo, path, o->mode, &o->blob_oid, 1, (char **) &sb->final_buf,
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2854	&sb->final_buf_size))
				2855	;
				2856	else
brian m. carlson	b4f5aca	2018-03-12 02:27:53 +0000	[diff] [blame]	2857	sb->final_buf = read_object_file(&o->blob_oid, &type,
				2858	&sb->final_buf_size);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2859
				2860	if (!sb->final_buf)
				2861	die(_("cannot read blob %s for path %s"),
				2862	oid_to_hex(&o->blob_oid),
				2863	path);
				2864	}
				2865	sb->num_read_blob++;
				2866	prepare_lines(sb);
				2867
				2868	if (orig)
				2869	*orig = o;
SZEDER Gábor	9e7d8a9	2017-07-24 23:15:50 +0200	[diff] [blame]	2870
				2871	free((char *)final_commit_name);
Jeff Smith	09002f1	2017-05-24 00:15:36 -0500	[diff] [blame]	2872	}
Jeff Smith	bd481de	2017-05-24 00:15:37 -0500	[diff] [blame]	2873
				2874
				2875
				2876	struct blame_entry blame_entry_prepend(struct blame_entry head,
				2877	long start, long end,
				2878	struct blame_origin *o)
				2879	{
				2880	struct blame_entry *new_head = xcalloc(1, sizeof(struct blame_entry));
				2881	new_head->lno = start;
				2882	new_head->num_lines = end - start;
				2883	new_head->suspect = o;
				2884	new_head->s_lno = start;
				2885	new_head->next = head;
				2886	blame_origin_incref(o);
				2887	return new_head;
				2888	}
Derrick Stolee	0906ac2	2020-04-16 20:14:04 +0000	[diff] [blame]	2889
				2890	void setup_blame_bloom_data(struct blame_scoreboard *sb,
				2891	const char *path)
				2892	{
				2893	struct blame_bloom_data *bd;
				2894
				2895	if (!sb->repo->objects->commit_graph)
				2896	return;
				2897
				2898	if (!sb->repo->objects->commit_graph->bloom_filter_settings)
				2899	return;
				2900
				2901	bd = xmalloc(sizeof(struct blame_bloom_data));
				2902
				2903	bd->settings = sb->repo->objects->commit_graph->bloom_filter_settings;
				2904
				2905	bd->alloc = 4;
				2906	bd->nr = 0;
				2907	ALLOC_ARRAY(bd->keys, bd->alloc);
				2908
				2909	add_bloom_key(bd, path);
				2910
				2911	sb->bloom_data = bd;
				2912	}
				2913
				2914	void cleanup_scoreboard(struct blame_scoreboard *sb)
				2915	{
				2916	if (sb->bloom_data) {
				2917	int i;
				2918	for (i = 0; i < sb->bloom_data->nr; i++) {
				2919	free(sb->bloom_data->keys[i]->hashes);
				2920	free(sb->bloom_data->keys[i]);
				2921	}
				2922	free(sb->bloom_data->keys);
				2923	FREE_AND_NULL(sb->bloom_data);
				2924
				2925	trace2_data_intmax("blame", sb->repo,
				2926	"bloom/queries", bloom_count_queries);
				2927	trace2_data_intmax("blame", sb->repo,
				2928	"bloom/response-no", bloom_count_no);
				2929	}
				2930	}