git: b74a756a9f53 - main - collate: Add support for equivalence classes and collating symbols
Date: Thu, 10 Apr 2025 09:44:53 UTC
The branch main has been updated by bnovkov: URL: https://cgit.FreeBSD.org/src/commit/?id=b74a756a9f537c6da41d721075f69216160a08a6 commit b74a756a9f537c6da41d721075f69216160a08a6 Author: Bojan Novković <bnovkov@FreeBSD.org> AuthorDate: 2025-04-03 14:45:08 +0000 Commit: Bojan Novković <bnovkov@FreeBSD.org> CommitDate: 2025-04-10 09:44:23 +0000 collate: Add support for equivalence classes and collating symbols This change adds support for matching single and multi-character equivalence classes and collating symbols, as specified by POSIX1, section 9.3.5. Sponsored by: Klara, Inc. Obtained from: https://github.com/apple-oss-distributions/Libc Differential Revision: https://reviews.freebsd.org/D49659 Reviewed by: markj --- lib/libc/locale/collate.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++ lib/libc/locale/collate.h | 11 +- 2 files changed, 269 insertions(+), 1 deletion(-) diff --git a/lib/libc/locale/collate.c b/lib/libc/locale/collate.c index 8e3635485f10..c0fc4c91481d 100644 --- a/lib/libc/locale/collate.c +++ b/lib/libc/locale/collate.c @@ -43,6 +43,7 @@ #include <sys/mman.h> #include <assert.h> +#include <ctype.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -732,3 +733,261 @@ __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) } return (0); } + +/* + * __collate_collating_symbol takes the multibyte string specified by + * src and slen, and using ps, converts that to a wide character. Then + * it is checked to verify it is a collating symbol, and then copies + * it to the wide character string specified by dst and dlen (the + * results are not null terminated). The length of the wide characters + * copied to dst is returned if successful. Zero is returned if no such + * collating symbol exists. (size_t)-1 is returned if there are wide-character + * conversion errors, if the length of the converted string is greater than + * COLLATE_STR_LEN or if dlen is too small. 
It is up to the calling routine to + * preserve the mbstate_t structure as needed. + */ +size_t +__collate_collating_symbol(wchar_t *dst, size_t dlen, const char *src, + size_t slen, mbstate_t *ps) +{ + wchar_t wname[COLLATE_STR_LEN]; + wchar_t w, *wp; + struct xlocale_collate *table; + size_t len, l; + + table = + (struct xlocale_collate *)__get_locale()->components[XLC_COLLATE]; + /* POSIX locale */ + if (table->__collate_load_error) { + if (dlen < 1) + return ((size_t)-1); + if (slen != 1 || !isascii(*src)) + return (0); + *dst = *src; + return (1); + } + for (wp = wname, len = 0; slen > 0; len++) { + l = mbrtowc(&w, src, slen, ps); + if (l == (size_t)-1 || l == (size_t)-2) + return ((size_t)-1); + if (l == 0) + break; + if (len >= COLLATE_STR_LEN) + return ((size_t)-1); + *wp++ = w; + src += l; + slen -= l; + } + if (len == 0 || len > dlen) + return ((size_t)-1); + if (len == 1) { + if (*wname <= UCHAR_MAX) { + if (table->char_pri_table[*wname].pri[0] >= 0) { + if (dlen > 0) + *dst = *wname; + return (1); + } + return (0); + } else if (table->info->large_count > 0) { + collate_large_t *match; + match = largesearch(table, *wname); + if (match && match->pri.pri[0] >= 0) { + if (dlen > 0) + *dst = *wname; + return (1); + } + } + return (0); + } + *wp = 0; + if (table->info->chain_count > 0) { + collate_chain_t *match; + int ll; + match = chainsearch(table, wname, &ll); + if (match) { + if (ll < dlen) + dlen = ll; + wcsncpy(dst, wname, dlen); + return (dlen); + } + } + return (0); +} + +/* + * __collate_equiv_class returns the equivalence class number for the symbol + * specified by src and slen, using ps to convert from multi-byte to wide + * character. Zero is returned if the symbol is not in an equivalence + * class. -1 is returned if there are wide character conversion errors, + * if there are any greater-than-8-bit characters or if a multi-byte symbol + * is greater or equal to COLLATE_STR_LEN in length. 
It is up to the calling + * routine to preserve the mbstate_t structure as needed. + */ +int +__collate_equiv_class(const char *src, size_t slen, mbstate_t *ps) +{ + wchar_t wname[COLLATE_STR_LEN]; + wchar_t w, *wp; + struct xlocale_collate *table; + size_t len, l; + int e; + + table = + (struct xlocale_collate *)__get_locale()->components[XLC_COLLATE]; + /* POSIX locale */ + if (table->__collate_load_error) + return (0); + for (wp = wname, len = 0; slen > 0; len++) { + l = mbrtowc(&w, src, slen, ps); + if (l == (size_t)-1 || l == (size_t)-2) + return (-1); + if (l == 0) + break; + if (len >= COLLATE_STR_LEN) + return (-1); + *wp++ = w; + src += l; + slen -= l; + } + if (len == 0) + return (-1); + if (len == 1) { + e = -1; + if (*wname <= UCHAR_MAX) + e = table->char_pri_table[*wname].pri[0]; + else if (table->info->large_count > 0) { + collate_large_t *match; + match = largesearch(table, *wname); + if (match) + e = match->pri.pri[0]; + } + if (e == 0) + return (IGNORE_EQUIV_CLASS); + return (e > 0 ? e : 0); + } + *wp = 0; + if (table->info->chain_count > 0) { + collate_chain_t *match; + int ll; + match = chainsearch(table, wname, &ll); + if (match) { + e = match->pri[0]; + if (e == 0) + return (IGNORE_EQUIV_CLASS); + return (e < 0 ? -e : e); + } + } + return (0); +} + + +/* + * __collate_equiv_match tries to match any single or multi-character symbol + * in equivalence class equiv_class in the multi-byte string specified by src + * and slen. If start is non-zero, it is taken to be the first (pre-converted) + * wide character. Subsequent wide characters, if needed, will use ps in + * the conversion. On a successful match, the length of the matched string + * is returned (including the start character). If dst is non-NULL, the + * matched wide-character string is copied to dst, a wide character array of + * length dlen (the results are not zero-terminated). If rlen is non-NULL, + * the number of characters in src actually used is returned. 
Zero is + * returned by __collate_equiv_match if there is no match. (size_t)-1 is + * returned on error: if there were conversion errors or if dlen is too small + * to accept the results. On no match or error, ps is restored to its incoming + * state. + */ +size_t +__collate_equiv_match(int equiv_class, wchar_t *dst, size_t dlen, wchar_t start, + const char *src, size_t slen, mbstate_t *ps, size_t *rlen) +{ + wchar_t w; + size_t len, l, clen; + int i; + wchar_t buf[COLLATE_STR_LEN], *wp; + mbstate_t save; + const char *s = src; + struct xlocale_collate *table; + size_t sl = slen; + collate_chain_t *ch = NULL; + + table = + (struct xlocale_collate *)__get_locale()->components[XLC_COLLATE]; + /* POSIX locale */ + if (table->__collate_load_error) + return ((size_t)-1); + if (equiv_class == IGNORE_EQUIV_CLASS) + equiv_class = 0; + if (ps) + save = *ps; + wp = buf; + len = clen = 0; + if (start) { + *wp++ = start; + len = 1; + } + /* convert up to the max chain length */ + while (sl > 0 && len < table->info->chain_max_len) { + l = mbrtowc(&w, s, sl, ps); + if (l == (size_t)-1 || l == (size_t)-2 || l == 0) + break; + *wp++ = w; + s += l; + clen += l; + sl -= l; + len++; + } + *wp = 0; + if (len > 1 && (ch = chainsearch(table, buf, &i)) != NULL) { + int e = ch->pri[0]; + if (e < 0) + e = -e; + if (e == equiv_class) + goto found; + } + /* try single character */ + i = 1; + if (*buf <= UCHAR_MAX) { + if (equiv_class == table->char_pri_table[*buf].pri[0]) + goto found; + } else if (table->info->large_count > 0) { + collate_large_t *match; + match = largesearch(table, *buf); + if (match && equiv_class == match->pri.pri[0]) + goto found; + } + /* no match */ + if (ps) + *ps = save; + return (0); +found: + /* + * If we converted more than we used, restore to initial + * and reconvert up to what did match. 
+ */ + if (i < len) { + len = i; + if (ps) + *ps = save; + if (start) + i--; + clen = 0; + while (i-- > 0) { + l = mbrtowc(&w, src, slen, ps); + src += l; + clen += l; + slen -= l; + } + } + if (dst) { + if (dlen < len) { + if (ps) + *ps = save; + return ((size_t)-1); + } + for (wp = buf; len > 0; len--) + *dst++ = *wp++; + } + if (rlen) + *rlen = clen; + return (len); +} diff --git a/lib/libc/locale/collate.h b/lib/libc/locale/collate.h index f157d8651899..64e0e6f2337d 100644 --- a/lib/libc/locale/collate.h +++ b/lib/libc/locale/collate.h @@ -38,6 +38,7 @@ #include <sys/types.h> #include <limits.h> +#include <wchar.h> #include "xlocale_private.h" /* @@ -65,6 +66,8 @@ #define DIRECTIVE_DIRECTION_MASK (DIRECTIVE_FORWARD | DIRECTIVE_BACKWARD) +#define IGNORE_EQUIV_CLASS 1 + /* * The collate file format is as follows: * @@ -85,6 +88,7 @@ typedef struct collate_info { uint8_t directive_count; uint8_t directive[COLL_WEIGHTS_MAX]; + uint8_t chain_max_len; /* In padding */ int32_t pri_count[COLL_WEIGHTS_MAX]; int32_t flags; int32_t chain_count; @@ -126,8 +130,13 @@ struct xlocale_collate { }; __BEGIN_DECLS -int __collate_load_tables(const char *); +size_t __collate_collating_symbol(wchar_t *, size_t, const char *, size_t, + mbstate_t *); +int __collate_equiv_class(const char *, size_t, mbstate_t *); int __collate_equiv_value(locale_t, const wchar_t *, size_t); +size_t __collate_equiv_match(int, wchar_t *, size_t, wchar_t, const char *, + size_t, mbstate_t *, size_t *); +int __collate_load_tables(const char *); void _collate_lookup(struct xlocale_collate *,const wchar_t *, int *, int *, int, const int **); int __collate_range_cmp(char, char);