git: f4f4fa8d04df - stable/13 - localedata: add some exceptions to utf8proc widths

From: Kyle Evans <kevans_at_FreeBSD.org>
Date: Fri, 22 Nov 2024 04:53:54 UTC
The branch stable/13 has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=f4f4fa8d04dfb1e27c4b3a82c1b032545e74e2e4

commit f4f4fa8d04dfb1e27c4b3a82c1b032545e74e2e4
Author:     Kyle Evans <kevans@FreeBSD.org>
AuthorDate: 2024-11-13 22:12:42 +0000
Commit:     Kyle Evans <kevans@FreeBSD.org>
CommitDate: 2024-11-22 04:53:43 +0000

    localedata: add some exceptions to utf8proc widths
    
    Hangul Jamo medial vowels and final consonants are reportedly combining
    characters that won't take up any columns on their own and should be
    reported as zero-width, so add an exception for these as well to reflect
    how they work in practice.  This conforms to how other implementations
    (e.g., glibc) treat these characters.
    
    Reviewed by:    bapt (earlier version), jkim
    Sponsored by:   Klara, Inc.
    
    (cherry picked from commit 160c36eae41afa3c4944ed44778c2b48db8fbb77)
---
 tools/tools/locale/tools/getwidths.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tools/tools/locale/tools/getwidths.c b/tools/tools/locale/tools/getwidths.c
index 2790b8031912..63c62791253f 100644
--- a/tools/tools/locale/tools/getwidths.c
+++ b/tools/tools/locale/tools/getwidths.c
@@ -28,6 +28,21 @@
 
 #include <utf8proc.h>
 
+static int
+width_of(int32_t wc)
+{
+
+	/*
+	 * Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) act
+	 * as combining characters, so report them as zero-width.
+	 */
+	if (wc >= 0x1160 && wc <= 0x11ff)
+		return (0);
+
+	/* No override applies to wc; defer to the width utf8proc reports. */
+	return (utf8proc_charwidth(wc));
+}
+
 int
 main(void)
 {
@@ -43,9 +58,10 @@ main(void)
 		wcc = utf8proc_category(wc);
 		if (wcc == UTF8PROC_CATEGORY_CC)
 			continue;
-		wcw = utf8proc_charwidth(wc);
+		wcw = width_of(wc);
 		if (wcw == 1)
 			continue;
+
 		printf("%04X %d\n", wc, wcw);
 	}