t/lib-unicode-nfc-nfd.sh - jrn/git - Git at Google

 # Help detect how Unicode NFC and NFD are handled on the filesystem.

 # A simple character that has a NFD form.
 #
 # NFC:       U+00e9 LATIN SMALL LETTER E WITH ACUTE
 # UTF8(NFC): \xc3 \xa9
 #
 # NFD:       U+0065 LATIN SMALL LETTER E
 #            U+0301 COMBINING ACUTE ACCENT
 # UTF8(NFD): \x65  +  \xcc \x81
 #
 utf8_nfc=$(printf "\xc3\xa9")
 utf8_nfd=$(printf "\x65\xcc\x81")

 # Is the OS or the filesystem "Unicode composition sensitive"?
 #
 # That is, does the OS or the filesystem allow files to exist with
 # both the NFC and NFD spellings?  Or, does the OS/FS lie to us and
 # tell us that the NFC and NFD forms are equivalent.
 #
 # This is or may be independent of what type of filesystem we have,
 # since it might be handled by the OS at a layer above the FS.
 # Testing shows on MacOS using APFS, HFS+, and FAT32 reports a
 # collision, for example.
 #
 # This does not tell us how the Unicode pathname will be spelled
 # on disk, but rather only that the two spelling "collide".  We
 # will examine the actual on disk spelling in a later prereq.
 #
 test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
 	mkdir trial_${utf8_nfc} &&
 	mkdir trial_${utf8_nfd}
 '

 # Is the spelling of an NFC pathname preserved on disk?
 #
 # On MacOS with HFS+ and FAT32, NFC paths are converted into NFD
 # and on APFS, NFC paths are preserved.  As we have established
 # above, this is independent of "composition sensitivity".
 #
 test_lazy_prereq UNICODE_NFC_PRESERVED '
 	mkdir c_${utf8_nfc} &&
 	ls | test-tool hexdump >dump &&
 	grep "63 5f c3 a9" dump
 '

 # Is the spelling of an NFD pathname preserved on disk?
 #
 test_lazy_prereq UNICODE_NFD_PRESERVED '
 	mkdir d_${utf8_nfd} &&
 	ls | test-tool hexdump >dump &&
 	grep "64 5f 65 cc 81" dump
 '

 # The following _DOUBLE_ forms are more for my curiosity,
 # but there may be quirks lurking when there are multiple
 # combining characters in non-canonical order.

 # Unicode also allows multiple combining characters
 # that can be decomposed in pieces.
 #
 # NFC:        U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
 # UTF8(NFC):  \xe1 \xbd \xa7
 #
 # NFD1:       U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
 #             U+0342 COMBINING GREEK PERISPOMENI
 # UTF8(NFD1): \xe1 \xbd \xa1  +  \xcd \x82
 #
 # But U+1f61 decomposes into
 # NFD2:       U+03c9 GREEK SMALL LETTER OMEGA
 #             U+0314 COMBINING REVERSED COMMA ABOVE
 # UTF8(NFD2): \xcf \x89  +  \xcc \x94
 #
 # Yielding:   \xcf \x89  +  \xcc \x94  +  \xcd \x82
 #
 # Note that I've used the canonical ordering of the
 # combinining characters.  It is also possible to
 # swap them.  My testing shows that that non-standard
 # ordering also causes a collision in mkdir.  However,
 # the resulting names don't draw correctly on the
 # terminal (implying that the on-disk format also has
 # them out of order).
 #
 greek_nfc=$(printf "\xe1\xbd\xa7")
 greek_nfd1=$(printf "\xe1\xbd\xa1\xcd\x82")
 greek_nfd2=$(printf "\xcf\x89\xcc\x94\xcd\x82")

 # See if a double decomposition also collides.
 #
 test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
 	mkdir trial_${greek_nfc} &&
 	mkdir trial_${greek_nfd2}
 '

 # See if the NFC spelling appears on the disk.
 #
 test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
 	mkdir c_${greek_nfc} &&
 	ls | test-tool hexdump >dump &&
 	grep "63 5f e1 bd a7" dump
 '

 # See if the NFD spelling appears on the disk.
 #
 test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
 	mkdir d_${greek_nfd2} &&
 	ls | test-tool hexdump >dump &&
 	grep "64 5f cf 89 cc 94 cd 82" dump
 '

 # The following is for debugging. I found it useful when
 # trying to understand the various (OS, FS) quirks WRT
 # Unicode and how composition/decomposition is handled.
 # For example, when trying to understand how (macOS, APFS)
 # and (macOS, HFS) and (macOS, FAT32) compare.
 #
 # It is rather noisy, so it is disabled by default.
 #
 if test "$unicode_debug" = "true"
 then
 	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
 	then
 		echo NFC and NFD are distinct on this OS/filesystem.
 	else
 		echo NFC and NFD are aliases on this OS/filesystem.
 	fi

 	if test_have_prereq UNICODE_NFC_PRESERVED
 	then
 		echo NFC maintains original spelling.
 	else
 		echo NFC is modified.
 	fi

 	if test_have_prereq UNICODE_NFD_PRESERVED
 	then
 		echo NFD maintains original spelling.
 	else
 		echo NFD is modified.
 	fi

 	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
 	then
 		echo DOUBLE NFC and NFD are distinct on this OS/filesystem.
 	else
 		echo DOUBLE NFC and NFD are aliases on this OS/filesystem.
 	fi

 	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
 	then
 		echo Double NFC maintains original spelling.
 	else
 		echo Double NFC is modified.
 	fi

 	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
 	then
 		echo Double NFD maintains original spelling.
 	else
 		echo Double NFD is modified.
 	fi
 fi
	# Help detect how Unicode NFC and NFD are handled on the filesystem.

	# A simple character that has a NFD form.
	#
	# NFC: U+00e9 LATIN SMALL LETTER E WITH ACUTE
	# UTF8(NFC): \xc3 \xa9
	#
	# NFD: U+0065 LATIN SMALL LETTER E
	# U+0301 COMBINING ACUTE ACCENT
	# UTF8(NFD): \x65 + \xcc \x81
	#
	utf8_nfc=$(printf "\xc3\xa9")
	utf8_nfd=$(printf "\x65\xcc\x81")

	# Is the OS or the filesystem "Unicode composition sensitive"?
	#
	# That is, does the OS or the filesystem allow files to exist with
	# both the NFC and NFD spellings? Or, does the OS/FS lie to us and
	# tell us that the NFC and NFD forms are equivalent.
	#
	# This is or may be independent of what type of filesystem we have,
	# since it might be handled by the OS at a layer above the FS.
	# Testing shows on MacOS using APFS, HFS+, and FAT32 reports a
	# collision, for example.
	#
	# This does not tell us how the Unicode pathname will be spelled
	# on disk, but rather only that the two spelling "collide". We
	# will examine the actual on disk spelling in a later prereq.
	#
	test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
	mkdir trial_${utf8_nfc} &&
	mkdir trial_${utf8_nfd}
	'

	# Is the spelling of an NFC pathname preserved on disk?
	#
	# On MacOS with HFS+ and FAT32, NFC paths are converted into NFD
	# and on APFS, NFC paths are preserved. As we have established
	# above, this is independent of "composition sensitivity".
	#
	test_lazy_prereq UNICODE_NFC_PRESERVED '
	mkdir c_${utf8_nfc} &&
	ls \| test-tool hexdump >dump &&
	grep "63 5f c3 a9" dump
	'

	# Is the spelling of an NFD pathname preserved on disk?
	#
	test_lazy_prereq UNICODE_NFD_PRESERVED '
	mkdir d_${utf8_nfd} &&
	ls \| test-tool hexdump >dump &&
	grep "64 5f 65 cc 81" dump
	'

	# The following _DOUBLE_ forms are more for my curiosity,
	# but there may be quirks lurking when there are multiple
	# combining characters in non-canonical order.

	# Unicode also allows multiple combining characters
	# that can be decomposed in pieces.
	#
	# NFC: U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
	# UTF8(NFC): \xe1 \xbd \xa7
	#
	# NFD1: U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
	# U+0342 COMBINING GREEK PERISPOMENI
	# UTF8(NFD1): \xe1 \xbd \xa1 + \xcd \x82
	#
	# But U+1f61 decomposes into
	# NFD2: U+03c9 GREEK SMALL LETTER OMEGA
	# U+0314 COMBINING REVERSED COMMA ABOVE
	# UTF8(NFD2): \xcf \x89 + \xcc \x94
	#
	# Yielding: \xcf \x89 + \xcc \x94 + \xcd \x82
	#
	# Note that I've used the canonical ordering of the
	# combinining characters. It is also possible to
	# swap them. My testing shows that that non-standard
	# ordering also causes a collision in mkdir. However,
	# the resulting names don't draw correctly on the
	# terminal (implying that the on-disk format also has
	# them out of order).
	#
	greek_nfc=$(printf "\xe1\xbd\xa7")
	greek_nfd1=$(printf "\xe1\xbd\xa1\xcd\x82")
	greek_nfd2=$(printf "\xcf\x89\xcc\x94\xcd\x82")

	# See if a double decomposition also collides.
	#
	test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
	mkdir trial_${greek_nfc} &&
	mkdir trial_${greek_nfd2}
	'

	# See if the NFC spelling appears on the disk.
	#
	test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
	mkdir c_${greek_nfc} &&
	ls \| test-tool hexdump >dump &&
	grep "63 5f e1 bd a7" dump
	'

	# See if the NFD spelling appears on the disk.
	#
	test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
	mkdir d_${greek_nfd2} &&
	ls \| test-tool hexdump >dump &&
	grep "64 5f cf 89 cc 94 cd 82" dump
	'

	# The following is for debugging. I found it useful when
	# trying to understand the various (OS, FS) quirks WRT
	# Unicode and how composition/decomposition is handled.
	# For example, when trying to understand how (macOS, APFS)
	# and (macOS, HFS) and (macOS, FAT32) compare.
	#
	# It is rather noisy, so it is disabled by default.
	#
	if test "$unicode_debug" = "true"
	then
	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
	then
	echo NFC and NFD are distinct on this OS/filesystem.
	else
	echo NFC and NFD are aliases on this OS/filesystem.
	fi

	if test_have_prereq UNICODE_NFC_PRESERVED
	then
	echo NFC maintains original spelling.
	else
	echo NFC is modified.
	fi

	if test_have_prereq UNICODE_NFD_PRESERVED
	then
	echo NFD maintains original spelling.
	else
	echo NFD is modified.
	fi

	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
	then
	echo DOUBLE NFC and NFD are distinct on this OS/filesystem.
	else
	echo DOUBLE NFC and NFD are aliases on this OS/filesystem.
	fi

	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
	then
	echo Double NFC maintains original spelling.
	else
	echo Double NFC is modified.
	fi

	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
	then
	echo Double NFD maintains original spelling.
	else
	echo Double NFD is modified.
	fi
	fi