Merge branch 'bc/utf16-portability-fix' The code and tests assume that the system supplied iconv() would always use BOM in its output when asked to encode to UTF-16 (or UTF-32), but apparently some implementations output big-endian without BOM. A compile-time knob has been added to help such systems (e.g. NonStop) to add BOM to the output to increase portability. * bc/utf16-portability-fix: utf8: handle systems that don't write BOM for UTF-16

commit: 18f9fb687f708b568301a4af87194fd72e4010e0 [log] [tgz]
author: Junio C Hamano <gitster@pobox.com> Wed Feb 13 18:18:41 2019 -0800
committer: Junio C Hamano <gitster@pobox.com> Wed Feb 13 18:18:41 2019 -0800
tree: d6558509b0731622ee837aa05318a2ee81301fea
parent: 1db999ce8d369dbf20532f1b2ea8cbbcf6f2cc7b [diff]
parent: 79444c92943048f9ac62e9311038ebe43f5f0982 [diff]
diff --git a/Makefile b/Makefile
index af7e809..f0b2299 100644
--- a/Makefile
+++ b/Makefile

@@ -259,6 +259,10 @@
 # Define OLD_ICONV if your library has an old iconv(), where the second
 # (input buffer pointer) parameter is declared with type (const char **).
 #
+# Define ICONV_OMITS_BOM if your iconv implementation does not write a
+# byte-order mark (BOM) when writing UTF-16 or UTF-32 and always writes in
+# big-endian format.
+#
 # Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
 #
 # Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
@@ -1417,6 +1421,9 @@
 		EXTLIBS += $(ICONV_LINK) -liconv
 	endif
 endif
+ifdef ICONV_OMITS_BOM
+	BASIC_CFLAGS += -DICONV_OMITS_BOM
+endif
 ifdef NEEDS_LIBGEN
 	EXTLIBS += -lgen
 endif

diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh
index e58ecbf..500229a 100755
--- a/t/t0028-working-tree-encoding.sh
+++ b/t/t0028-working-tree-encoding.sh

@@ -6,6 +6,30 @@
 
 GIT_TRACE_WORKING_TREE_ENCODING=1 && export GIT_TRACE_WORKING_TREE_ENCODING
 
+test_lazy_prereq NO_UTF16_BOM '
+	test $(printf abc | iconv -f UTF-8 -t UTF-16 | wc -c) = 6
+'
+
+test_lazy_prereq NO_UTF32_BOM '
+	test $(printf abc | iconv -f UTF-8 -t UTF-32 | wc -c) = 12
+'
+
+write_utf16 () {
+	if test_have_prereq NO_UTF16_BOM
+	then
+		printf '\xfe\xff'
+	fi &&
+	iconv -f UTF-8 -t UTF-16
+}
+
+write_utf32 () {
+	if test_have_prereq NO_UTF32_BOM
+	then
+		printf '\x00\x00\xfe\xff'
+	fi &&
+	iconv -f UTF-8 -t UTF-32
+}
+
 test_expect_success 'setup test files' '
 	git config core.eol lf &&
 
@@ -13,8 +37,8 @@
 	echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes &&
 	echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes &&
 	printf "$text" >test.utf8.raw &&
-	printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw &&
-	printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw &&
+	printf "$text" | write_utf16 >test.utf16.raw &&
+	printf "$text" | write_utf32 >test.utf32.raw &&
 	printf "\377\376"                         >test.utf16lebom.raw &&
 	printf "$text" | iconv -f UTF-8 -t UTF-32LE >>test.utf16lebom.raw &&
 
@@ -124,8 +148,8 @@
 		test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
 		test_when_finished "git reset --hard HEAD^" &&
 
-		cat lf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >lf.utf${i}.raw &&
-		cat crlf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >crlf.utf${i}.raw &&
+		cat lf.utf8.raw | write_utf${i} >lf.utf${i}.raw &&
+		cat crlf.utf8.raw | write_utf${i} >crlf.utf${i}.raw &&
 		cp crlf.utf${i}.raw eol.utf${i} &&
 
 		cat >expectIndexLF <<-EOF &&
@@ -223,7 +247,7 @@
 
 	text="hallo there!\nroundtrip test here!" &&
 	printf "$text" | iconv -f UTF-8 -t SHIFT-JIS >roundtrip.shift &&
-	printf "$text" | iconv -f UTF-8 -t UTF-16 >roundtrip.utf16 &&
+	printf "$text" | write_utf16 >roundtrip.utf16 &&
 	echo "*.shift text working-tree-encoding=SHIFT-JIS" >>.gitattributes &&
 
 	# SHIFT-JIS encoded files are round-trip checked by default...

diff --git a/utf8.c b/utf8.c
index 83824dc..3b42fad 100644
--- a/utf8.c
+++ b/utf8.c

@@ -559,6 +559,10 @@
 	/*
 	 * For writing, UTF-16 iconv typically creates "UTF-16BE-BOM"
 	 * Some users under Windows want the little endian version
+	 *
+	 * We handle UTF-16 and UTF-32 ourselves only if the platform does not
+	 * provide a BOM (which we require), since we want to match the behavior
+	 * of the system tools and libc as much as possible.
 	 */
 	if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
 		bom_str = utf16_le_bom;
@@ -568,6 +572,16 @@
 		bom_str = utf16_be_bom;
 		bom_len = sizeof(utf16_be_bom);
 		out_encoding = "UTF-16BE";
+#ifdef ICONV_OMITS_BOM
+	} else if (same_utf_encoding("UTF-16", out_encoding)) {
+		bom_str = utf16_be_bom;
+		bom_len = sizeof(utf16_be_bom);
+		out_encoding = "UTF-16BE";
+	} else if (same_utf_encoding("UTF-32", out_encoding)) {
+		bom_str = utf32_be_bom;
+		bom_len = sizeof(utf32_be_bom);
+		out_encoding = "UTF-32BE";
+#endif
 	}
 
 	conv = iconv_open(out_encoding, in_encoding);
commit	18f9fb687f708b568301a4af87194fd72e4010e0	[log] [tgz]
author	Junio C Hamano <gitster@pobox.com>	Wed Feb 13 18:18:41 2019 -0800
committer	Junio C Hamano <gitster@pobox.com>	Wed Feb 13 18:18:41 2019 -0800
tree	d6558509b0731622ee837aa05318a2ee81301fea
parent	1db999ce8d369dbf20532f1b2ea8cbbcf6f2cc7b [diff]
parent	79444c92943048f9ac62e9311038ebe43f5f0982 [diff]