[PATCH] diff overhaul

This cleans up the way calls are made into the diff core from diff-tree
family and diff-helper.  Earlier, these programs had "if
(generating_patch)" sprinkled all over the place, but those ugliness are
gone and handled uniformly from the diff core, even when not generating
patch format.

This also allowed diff-cache and diff-files to acquire -R
(reverse) option to generate diff in reverse.  Users of
diff-tree can swap two trees easily so I did not add -R there.

[ Linus' note: I'll add -R to "diff-tree" too, since a "commit
  diff" doesn't have another tree to switch around: the other
  tree is always the parent(s) of the commit ]

Also -M<digits-as-mantissa> suggestion made by Linus has been
implemented.

Documentation updates are also included.

Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/diff.c b/diff.c
index 92fcc67..c4b6b76 100644
--- a/diff.c
+++ b/diff.c
@@ -11,6 +11,8 @@
 
 static const char *diff_opts = "-pu";
 static unsigned char null_sha1[20] = { 0, };
+#define MAX_SCORE 10000
+#define DEFAULT_MINIMUM_SCORE 5000
 
 static const char *external_diff(void)
 {
@@ -55,7 +57,7 @@
 	const char *cp;
 	char *bp;
 
-	/* count bytes needed to store the quoted string. */ 
+	/* count bytes needed to store the quoted string. */
 	for (cnt = 1, cp = src; *cp; cnt++, cp++)
 		if (*cp == '\'')
 			cnt += 3;
@@ -93,7 +95,8 @@
 
 static void builtin_diff(const char *name_a,
 			 const char *name_b,
-			 struct diff_tempfile *temp)
+			 struct diff_tempfile *temp,
+			 int rename_score)
 {
 	int i, next_at, cmd_size;
 	const char *diff_cmd = "diff -L'%s%s' -L'%s%s'";
@@ -149,6 +152,10 @@
 			printf("new mode %s\n", temp[1].mode);
 		}
 		if (strcmp(name_a, name_b)) {
+			if (0 < rename_score)
+				printf("rename similarity index %d%%\n",
+				       (int)(0.5+
+					     rename_score*100.0/MAX_SCORE));
 			printf("rename old %s\n", name_a);
 			printf("rename new %s\n", name_b);
 		}
@@ -184,7 +191,7 @@
 	 * file.  Practically, this code only helps when we are used
 	 * by diff-cache --cached, which does read the cache before
 	 * calling us.
-	 */ 
+	 */
 	if (!active_cache)
 		return 0;
 
@@ -302,9 +309,10 @@
 
 static int detect_rename;
 static int reverse_diff;
+static int diff_raw_output = -1;
 static const char **pathspec;
 static int speccnt;
-static int diff_rename_minimum_score;
+static int minimum_score;
 
 static int matches_pathspec(const char *name)
 {
@@ -334,7 +342,8 @@
 static void run_external_diff(const char *name,
 			      const char *other,
 			      struct diff_spec *one,
-			      struct diff_spec *two)
+			      struct diff_spec *two,
+			      int rename_score)
 {
 	struct diff_tempfile *temp = diff_temp;
 	pid_t pid;
@@ -395,7 +404,7 @@
 		 * otherwise we use the built-in one.
 		 */
 		if (one && two)
-			builtin_diff(name, other ? : name, temp);
+			builtin_diff(name, other ? : name, temp, rename_score);
 		else
 			printf("* Unmerged path %s\n", name);
 		exit(0);
@@ -446,7 +455,7 @@
 		die("internal error");
 
 	if (!detect_rename) {
-		run_external_diff(name, NULL, one, two);
+		run_external_diff(name, NULL, one, two, -1);
 		return;
 	}
 	elem = xmalloc(sizeof(*elem) + strlen(name));
@@ -519,10 +528,10 @@
 			continue;
 		if (on_created_list)
 			run_external_diff(elem->path, NULL,
-					  &null_file_spec, &elem->it);
+					  &null_file_spec, &elem->it, -1);
 		else
 			run_external_diff(elem->path, NULL,
-					  &elem->it, &null_file_spec);
+					  &elem->it, &null_file_spec, -1);
 	}
 }
 
@@ -541,7 +550,6 @@
 	return 0;
 }
 
-#define MINIMUM_SCORE 5000
 int estimate_similarity(struct diff_spec_hold *src, struct diff_spec_hold *dst)
 {
 	/* src points at a deleted file and dst points at a created
@@ -549,20 +557,24 @@
 	 * say src is renamed to dst.
 	 *
 	 * Compare them and return how similar they are, representing
-	 * the score as an integer between 0 and 10000.  10000 is
-	 * reserved for the case where they match exactly.
+	 * the score as an integer between 0 and 10000, except
+	 * where they match exactly it is considered better than anything
+	 * else.
 	 */
 	void *delta;
 	unsigned long delta_size;
+	int score;
 
 	delta_size = ((src->size < dst->size) ?
 		      (dst->size - src->size) : (src->size - dst->size));
 
 	/* We would not consider rename followed by more than
-	 * 20% edits; that is, delta_size must be smaller than
-	 * (src->size + dst->size)/2 * 0.2, which means...
+	 * minimum_score/MAX_SCORE edits; that is, delta_size must be smaller
+	 * than (src->size + dst->size)/2 * minimum_score/MAX_SCORE,
+	 * which means...
 	 */
-	if ((src->size + dst->size) < delta_size * 10)
+
+	if ((src->size+dst->size)*minimum_score < delta_size*MAX_SCORE*2)
 		return 0;
 
 	delta = diff_delta(src->data, src->size,
@@ -573,14 +585,17 @@
 	/* This "delta" is really xdiff with adler32 and all the
 	 * overheads but it is a quick and dirty approximation.
 	 *
-	 * Now we will give some score to it.  Let's say 20% edit gets
-	 * 5000 points and 0% edit gets 9000 points.  That is, every
-	 * 1/20000 edit gets 1 point penalty.  The amount of penalty is:
+	 * Now we will give some score to it.  100% edit gets
+	 * 0 points and 0% edit gets MAX_SCORE points.  That is, every
+	 * 1/MAX_SCORE edit gets 1 point penalty.  The amount of penalty is:
 	 *
-	 * (delta_size * 2 / (src->size + dst->size)) * 20000
+	 * (delta_size * 2 / (src->size + dst->size)) * MAX_SCORE
 	 *
 	 */
-	return 9000 - (40000 * delta_size / (src->size+dst->size));
+	score = MAX_SCORE-(MAX_SCORE*2*delta_size/(src->size+dst->size));
+	if (score < 0) return 0;
+	if (MAX_SCORE < score) return MAX_SCORE;
+	return score;
 }
 
 struct diff_score {
@@ -596,14 +611,15 @@
 }
 
 static void flush_rename_pair(struct diff_spec_hold *src,
-			      struct diff_spec_hold *dst)
+			      struct diff_spec_hold *dst,
+			      int rename_score)
 {
 	src->flags |= MATCHED;
 	dst->flags |= MATCHED;
 	free_data(src);
 	free_data(dst);
 	run_external_diff(src->path, dst->path,
-			  &src->it, &dst->it);
+			  &src->it, &dst->it, rename_score);
 }
 
 static void free_held_diff(struct diff_spec_hold *list)
@@ -637,7 +653,7 @@
 				continue;
 			if (! is_exact_match(src, dst))
 				continue;
-			flush_rename_pair(src, dst);
+			flush_rename_pair(src, dst, MAX_SCORE);
 			break;
 		}
 	}
@@ -670,7 +686,7 @@
 		}
 		c++;
 	}
-	qsort(mx, num_create*num_delete, sizeof(*mx), score_compare); 
+	qsort(mx, num_create*num_delete, sizeof(*mx), score_compare);
 
 #if 0
  	for (c = 0; c < num_create * num_delete; c++) {
@@ -689,9 +705,9 @@
 		dst = mx[c].dst;
 		if ((src->flags & MATCHED) || (dst->flags & MATCHED))
 			continue;
-		if (mx[c].score < diff_rename_minimum_score)
+		if (mx[c].score < minimum_score)
 			break;
-		flush_rename_pair(src, dst);
+		flush_rename_pair(src, dst, mx[c].score);
 	}
 	free(mx);
 
@@ -703,7 +719,26 @@
 	createdfile = deletedfile = NULL;
 }
 
+int diff_scoreopt_parse(const char *opt)
+{
+	int diglen, num, scale, i;
+	if (opt[0] != '-' || opt[1] != 'M')
+		return -1; /* that is not -M option */
+	diglen = strspn(opt+2, "0123456789");
+	if (diglen == 0 || strlen(opt+2) != diglen)
+		return 0; /* use default */
+	sscanf(opt+2, "%d", &num);
+	for (i = 0, scale = 1; i < diglen; i++)
+		scale *= 10;
+
+	/* user says num divided by scale and we say internally that
+	 * is MAX_SCORE * num / scale.
+	 */
+	return MAX_SCORE * num / scale;
+}
+
 void diff_setup(int detect_rename_, int minimum_score_, int reverse_diff_,
+		int diff_raw_output_,
 		const char **pathspec_, int speccnt_)
 {
 	free_held_diff(createdfile);
@@ -713,8 +748,14 @@
 	detect_rename = detect_rename_;
 	reverse_diff = reverse_diff_;
 	pathspec = pathspec_;
+	diff_raw_output = diff_raw_output_;
 	speccnt = speccnt_;
-	diff_rename_minimum_score = minimum_score_ ? : MINIMUM_SCORE;
+	minimum_score = minimum_score_ ? : DEFAULT_MINIMUM_SCORE;
+}
+
+static const char *git_object_type(unsigned mode)
+{
+	return S_ISDIR(mode) ? "tree" : "blob";
 }
 
 void diff_addremove(int addremove, unsigned mode,
@@ -724,6 +765,21 @@
 	char concatpath[PATH_MAX];
 	struct diff_spec spec[2], *one, *two;
 
+	if (0 <= diff_raw_output) {
+		if (!path)
+			path = "";
+		if (reverse_diff)
+			addremove = (addremove == '+' ? '-' : '+');
+		printf("%c%06o %s %s %s%s%c",
+		       addremove,
+		       mode,
+		       git_object_type(mode), sha1_to_hex(sha1),
+		       base, path, diff_raw_output);
+		return;
+	}
+	if (S_ISDIR(mode))
+		return;
+
 	memcpy(spec[0].blob_sha1, sha1, 20);
 	spec[0].mode = mode;
 	spec[0].sha1_valid = !!memcmp(sha1, null_sha1, 20);
@@ -750,6 +806,29 @@
 	char concatpath[PATH_MAX];
 	struct diff_spec spec[2];
 
+	if (0 <= diff_raw_output) {
+		char old_hex[41];
+		strcpy(old_hex, sha1_to_hex(old_sha1));
+
+		if (!path)
+			path = "";
+		if (reverse_diff)
+			printf("*%06o->%06o %s %s->%s %s%s%c",
+			       new_mode, old_mode,
+			       git_object_type(new_mode),
+			       sha1_to_hex(new_sha1), old_hex,
+			       base, path, diff_raw_output);
+		else
+			printf("*%06o->%06o %s %s->%s %s%s%c",
+			       old_mode, new_mode,
+			       git_object_type(new_mode),
+			       old_hex, sha1_to_hex(new_sha1),
+			       base, path, diff_raw_output);
+		return;
+	}
+	if (S_ISDIR(new_mode))
+		return;
+
 	if (path) {
 		strcpy(concatpath, base);
 		strcat(concatpath, path);
@@ -766,10 +845,15 @@
 	/* We do not look at changed files as candidate for
 	 * rename detection ever.
 	 */
-	run_external_diff(path ? concatpath : base, NULL, &spec[0], &spec[1]);
+	run_external_diff(path ? concatpath : base, NULL,
+			  &spec[0], &spec[1], -1);
 }
 
 void diff_unmerge(const char *path)
 {
-	run_external_diff(path, NULL, NULL, NULL);
+	if (0 <= diff_raw_output) {
+		printf("U %s%c", path, diff_raw_output);
+		return;
+	}
+	run_external_diff(path, NULL, NULL, NULL, -1);
 }