git: af79566b9315 - main - fnmatch: Add support for collating symbols, equivalence classes, and character classes

From: Bojan Novković <bnovkov_at_FreeBSD.org>
Date: Thu, 10 Apr 2025 09:44:54 UTC
The branch main has been updated by bnovkov:

URL: https://cgit.FreeBSD.org/src/commit/?id=af79566b9315dd5251b093b6237a6fa239351398

commit af79566b9315dd5251b093b6237a6fa239351398
Author:     Bojan Novković <bnovkov@FreeBSD.org>
AuthorDate: 2025-04-03 14:57:51 +0000
Commit:     Bojan Novković <bnovkov@FreeBSD.org>
CommitDate: 2025-04-10 09:44:28 +0000

    fnmatch: Add support for collating symbols, equivalence classes, and character classes
    
    This change extends fnmatch to support collating symbol expressions,
    equivalence class expressions, and character class expressions (as
    defined by POSIX.1, section 9.3.5), along with the corresponding
    tests.
    
    Sponsored by:   Klara, Inc.
    Obtained from:  https://github.com/apple-oss-distributions/Libc
    Differential Revision:  https://reviews.freebsd.org/D49660
    Reviewed by:    markj, ziaee (manpages)
---
 lib/libc/gen/fnmatch.3            |   8 +-
 lib/libc/gen/fnmatch.c            | 221 ++++++++++++++++++++++++++++++++++----
 lib/libc/tests/gen/fnmatch_test.c |  81 ++++++++++++++
 3 files changed, 285 insertions(+), 25 deletions(-)

diff --git a/lib/libc/gen/fnmatch.3 b/lib/libc/gen/fnmatch.3
index 804bc968c1ce..7f020fec58e3 100644
--- a/lib/libc/gen/fnmatch.3
+++ b/lib/libc/gen/fnmatch.3
@@ -27,7 +27,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.Dd April 2, 2022
+.Dd April 7, 2025
 .Dt FNMATCH 3
 .Os
 .Sh NAME
@@ -129,12 +129,8 @@ otherwise, it returns the value
 .Sh STANDARDS
 The current implementation of the
 .Fn fnmatch
-function
-.Em does not
-conform to
+function is expected to conform to
 .St -p1003.2 .
-Collating symbol expressions, equivalence class expressions and
-character class expressions are not supported.
 .Sh HISTORY
 A predecessor to
 .Fn fnmatch ,
diff --git a/lib/libc/gen/fnmatch.c b/lib/libc/gen/fnmatch.c
index fb1829e69502..43e7e214c3cb 100644
--- a/lib/libc/gen/fnmatch.c
+++ b/lib/libc/gen/fnmatch.c
@@ -67,7 +67,8 @@
 #define RANGE_NOMATCH   0
 #define RANGE_ERROR     (-1)
 
-static int rangematch(const char *, wchar_t, int, char **, mbstate_t *);
+static int rangematch(const char *, wchar_t, const char *, int, char **,
+    char **, mbstate_t *, mbstate_t *);
 static int fnmatch1(const char *, const char *, const char *, int, mbstate_t,
 		mbstate_t);
 
@@ -85,7 +86,7 @@ fnmatch1(const char *pattern, const char *string, const char *stringstart,
 {
 	const char *bt_pattern, *bt_string;
 	mbstate_t bt_patmbs, bt_strmbs;
-	char *newp;
+	char *newp, *news;
 	char c;
 	wchar_t pc, sc;
 	size_t pclen, sclen;
@@ -164,17 +165,17 @@ fnmatch1(const char *pattern, const char *string, const char *stringstart,
 			    ((flags & FNM_PATHNAME) && *(string - 1) == '/')))
 				goto backtrack;
 
-			switch (rangematch(pattern, sc, flags, &newp,
-			    &patmbs)) {
+			switch (rangematch(pattern, sc, string + sclen, flags,
+			    &newp, &news, &patmbs, &strmbs)) {
 			case RANGE_ERROR:
 				goto norm;
 			case RANGE_MATCH:
 				pattern = newp;
+				string = news;
 				break;
 			case RANGE_NOMATCH:
 				goto backtrack;
 			}
-			string += sclen;
 			break;
 		case '\\':
 			if (!(flags & FNM_NOESCAPE)) {
@@ -218,8 +219,10 @@ fnmatch1(const char *pattern, const char *string, const char *stringstart,
 				if (sc == '/' && flags & FNM_PATHNAME)
 					return (FNM_NOMATCH);
 				bt_string += sclen;
-				pattern = bt_pattern, patmbs = bt_patmbs;
-				string = bt_string, strmbs = bt_strmbs;
+				pattern = bt_pattern;
+				patmbs = bt_patmbs;
+				string = bt_string;
+				strmbs = bt_strmbs;
 			}
 			break;
 		}
@@ -228,15 +231,20 @@ fnmatch1(const char *pattern, const char *string, const char *stringstart,
 }
 
 static int
-rangematch(const char *pattern, wchar_t test, int flags, char **newp,
-    mbstate_t *patmbs)
+rangematch(const char *pattern, wchar_t test, const char *string, int flags,
+    char **newp, char **news, mbstate_t *patmbs, mbstate_t *strmbs)
 {
 	int negate, ok;
 	wchar_t c, c2;
 	size_t pclen;
 	const char *origpat;
 	struct xlocale_collate *table =
-		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
+	    (struct xlocale_collate *)__get_locale()->components[XLC_COLLATE];
+	wchar_t buf[COLLATE_STR_LEN];	/* STR_LEN defined in collate.h */
+	const char *cp, *savestring;
+	int special;
+	mbstate_t save;
+	size_t sclen, len;
 
 	/*
 	 * A bracket expression starting with an unquoted circumflex
@@ -259,20 +267,132 @@ rangematch(const char *pattern, wchar_t test, int flags, char **newp,
 	ok = 0;
 	origpat = pattern;
 	for (;;) {
+		c = 0;
 		if (*pattern == ']' && pattern > origpat) {
-			pattern++;
 			break;
 		} else if (*pattern == '\0') {
 			return (RANGE_ERROR);
 		} else if (*pattern == '/' && (flags & FNM_PATHNAME)) {
 			return (RANGE_NOMATCH);
-		} else if (*pattern == '\\' && !(flags & FNM_NOESCAPE))
+		} else if (*pattern == '\\' && !(flags & FNM_NOESCAPE)) {
 			pattern++;
-		pclen = mbrtowc(&c, pattern, MB_LEN_MAX, patmbs);
-		if (pclen == (size_t)-1 || pclen == (size_t)-2)
-			return (RANGE_NOMATCH);
-		pattern += pclen;
+		} else if (*pattern == '[' &&
+		    ((special = *(pattern + 1)) == '.' ||
+		    special == '=' || special == ':')) {
+			cp = (pattern += 2);
+			while ((cp = strchr(cp, special))) {
+				if (*(cp + 1) == ']')
+					break;
+				cp++;
+			}
+			if (!cp)
+				return (RANGE_ERROR);
+			if (special == '.') {
+treat_like_collating_symbol:
+				len = __collate_collating_symbol(buf,
+				    COLLATE_STR_LEN, pattern,
+				    cp - pattern, patmbs);
+				if (len == (size_t)-1 || len == 0)
+					return (RANGE_ERROR);
+				pattern = cp + 2;
+				if (len > 1) {
+					wchar_t *wp, sc;
 
+					/*
+					 * No multi-character collating
+					 * symbols as start of range.
+					 */
+					if (*(cp + 2) == '-' &&
+					    *(cp + 3) != EOS &&
+					    *(cp + 3) != ']')
+						return (RANGE_ERROR);
+					wp = buf;
+					if (test != *wp++)
+						continue;
+					if (len == 1) {
+						ok = 1;
+						break;
+					}
+					memcpy(&save, strmbs, sizeof(save));
+					savestring = string;
+					while (--len > 0) {
+						sclen = mbrtowc(&sc, string,
+						    MB_LEN_MAX, strmbs);
+						if (sclen == (size_t)-1 ||
+						    sclen == (size_t)-2) {
+							sc = (unsigned char)*string;
+							sclen = 1;
+							memset(&strmbs, 0,
+							    sizeof(strmbs));
+						}
+						if (sc != *wp++) {
+							memcpy(strmbs, &save,
+							    sizeof(save));
+							string = savestring;
+							break;
+						}
+						string += sclen;
+					}
+					if (len == 0) {
+						ok = 1;
+						break;
+					}
+					continue; /* no match */
+				}
+				c = *buf;
+			} else if (special == '=') {
+				int ec;
+				memcpy(&save, patmbs, sizeof(save));
+				ec = __collate_equiv_class(pattern,
+				    cp - pattern, patmbs);
+				if (ec < 0)
+					return (RANGE_ERROR);
+				if (ec == 0) {
+					memcpy(patmbs, &save, sizeof(save));
+					goto treat_like_collating_symbol;
+				}
+				pattern = cp + 2;
+				/* no equivalence classes as start of range */
+				if (*(cp + 2) == '-' && *(cp + 3) != EOS &&
+				    *(cp + 3) != ']')
+					return (RANGE_ERROR);
+				len = __collate_equiv_match(ec, NULL, 0, test,
+				    string, strlen(string), strmbs, &sclen);
+				if (len < 0)
+					return (RANGE_ERROR);
+				if (len > 0) {
+					ok = 1;
+					string += sclen;
+					break;
+				}
+				continue;
+			} else { /* special == ':' */
+				wctype_t charclass;
+				char name[CHARCLASS_NAME_MAX + 1];
+				/* no character classes as start of range */
+				if (*(cp + 2) == '-' && *(cp + 3) != EOS &&
+				    *(cp + 3) != ']')
+					return (RANGE_ERROR);
+				/* assume character class names are ascii */
+				if (cp - pattern > CHARCLASS_NAME_MAX)
+					return (RANGE_ERROR);
+				strlcpy(name, pattern, cp - pattern + 1);
+				pattern = cp + 2;
+				if ((charclass = wctype(name)) == 0)
+					return (RANGE_ERROR);
+				if (iswctype(test, charclass)) {
+					ok = 1;
+					break;
+				}
+				continue;
+			}
+		}
+		if (!c) {
+			pclen = mbrtowc(&c, pattern, MB_LEN_MAX, patmbs);
+			if (pclen == (size_t)-1 || pclen == (size_t)-2)
+				return (RANGE_NOMATCH);
+			pattern += pclen;
+		}
 		if (flags & FNM_CASEFOLD)
 			c = towlower(c);
 
@@ -288,6 +408,37 @@ rangematch(const char *pattern, wchar_t test, int flags, char **newp,
 			if (c2 == EOS)
 				return (RANGE_ERROR);
 
+			if ((c2 == '[' && (special = *pattern) == '.') ||
+			    special == '=' || special == ':') {
+
+				/*
+				 * No equivalence classes or character
+				 * classes as end of range.
+				 */
+				if (special == '=' || special == ':')
+					return (RANGE_ERROR);
+				cp = ++pattern;
+				while ((cp = strchr(cp, special))) {
+					if (*(cp + 1) == ']')
+						break;
+					cp++;
+				}
+				if (!cp)
+					return (RANGE_ERROR);
+				len = __collate_collating_symbol(buf,
+				    COLLATE_STR_LEN, pattern,
+				    cp - pattern, patmbs);
+
+				/*
+				 * No multi-character collating symbols
+				 * as end of range.
+				 */
+				if (len != 1)
+					return (RANGE_ERROR);
+				pattern = cp + 2;
+				c2 = *buf;
+			}
+
 			if (flags & FNM_CASEFOLD)
 				c2 = towlower(c2);
 
@@ -295,12 +446,44 @@ rangematch(const char *pattern, wchar_t test, int flags, char **newp,
 			    c <= test && test <= c2 :
 			       __wcollate_range_cmp(c, test) <= 0
 			    && __wcollate_range_cmp(test, c2) <= 0
-			   )
+			   ) {
 				ok = 1;
-		} else if (c == test)
+				break;
+			}
+		} else if (c == test) {
 			ok = 1;
+			break;
+		}
 	}
 
-	*newp = (char *)pattern;
+	/* go to end of bracket expression */
+	special = 0;
+	while (*pattern != ']') {
+		if (*pattern == 0)
+			return (RANGE_ERROR);
+		if (*pattern == special) {
+			if (*++pattern == ']') {
+				special = 0;
+				pattern++;
+			}
+			continue;
+		}
+		if (!special && *pattern == '[') {
+			special = *++pattern;
+			if (special != '.' && special != '=' && special != ':')
+				special = 0;
+			else
+				pattern++;
+			continue;
+		}
+		pclen = mbrtowc(&c, pattern, MB_LEN_MAX, patmbs);
+		if (pclen == (size_t)-1 || pclen == (size_t)-2)
+			return (RANGE_NOMATCH);
+		pattern += pclen;
+	}
+
+	*newp = (char *)++pattern;
+	*news = (char *)string;
+
 	return (ok == negate ? RANGE_NOMATCH : RANGE_MATCH);
 }
diff --git a/lib/libc/tests/gen/fnmatch_test.c b/lib/libc/tests/gen/fnmatch_test.c
index 6cdf5f2a05fa..0ff7400a4a4f 100644
--- a/lib/libc/tests/gen/fnmatch_test.c
+++ b/lib/libc/tests/gen/fnmatch_test.c
@@ -26,6 +26,7 @@
 
 #include <sys/param.h>
 #include <errno.h>
+#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -176,10 +177,90 @@ ATF_TC_BODY(fnmatch_test, tc)
 
 }
 
+ATF_TC(fnmatch_characterclass);
+ATF_TC_HEAD(fnmatch_characterclass, tc)
+{
+	atf_tc_set_md_var(tc, "descr", "Test fnmatch with character classes");
+}
+
+ATF_TC_BODY(fnmatch_characterclass, tc)
+{
+	ATF_CHECK(fnmatch("[[:alnum:]]", "a", 0) == 0);
+	ATF_CHECK(fnmatch("[[:cntrl:]]", "\a", 0) == 0);
+	ATF_CHECK(fnmatch("[[:lower:]]", "a", 0) == 0);
+	ATF_CHECK(fnmatch("[[:space:]]", " ", 0) == 0);
+	ATF_CHECK(fnmatch("[[:alpha:]]", "a", 0) == 0);
+	ATF_CHECK(fnmatch("[[:digit:]]", "0", 0) == 0);
+	ATF_CHECK(fnmatch("[[:print:]]", "a", 0) == 0);
+	ATF_CHECK(fnmatch("[[:upper:]]", "A", 0) == 0);
+	ATF_CHECK(fnmatch("[[:blank:]]", " ", 0) == 0);
+	ATF_CHECK(fnmatch("[[:graph:]]", "a", 0) == 0);
+	ATF_CHECK(fnmatch("[[:punct:]]", ".", 0) == 0);
+	ATF_CHECK(fnmatch("[[:xdigit:]]", "f", 0) == 0);
+
+	/*
+	 * POSIX.1, section 9.3.5, states that '[:' and ':]'
+	 * should be interpreted as a character class expression
+	 * only when part of a bracket expression.
+	 */
+	ATF_CHECK(fnmatch("[:alnum:]", "a", 0) == 0);
+	ATF_CHECK(fnmatch("[:alnum:]", ":", 0) == 0);
+	ATF_CHECK(fnmatch("[:alnum:]", "1", 0) != 0);
+}
+
+ATF_TC(fnmatch_collsym);
+ATF_TC_HEAD(fnmatch_collsym, tc)
+{
+	atf_tc_set_md_var(tc, "descr", "Test fnmatch with collating symbols");
+}
+
+ATF_TC_BODY(fnmatch_collsym, tc)
+{
+	setlocale(LC_ALL, "cs_CZ.UTF-8");
+	ATF_CHECK(fnmatch("[ch]", "ch", 0) != 0);
+	ATF_CHECK(fnmatch("[[.ch.]]", "ch", 0) == 0);
+	ATF_CHECK(fnmatch("[[.ch.]]h", "chh", 0) == 0);
+
+	/*
+	 * POSIX.1, section 9.3.5, states that '[.' and '.]'
+	 * should be interpreted as a collating symbol only
+	 * when part of a bracket expression.
+	 */
+	ATF_CHECK(fnmatch("[.ch.]", "c", 0) == 0);
+	ATF_CHECK(fnmatch("[.ch.]", "h", 0) == 0);
+	ATF_CHECK(fnmatch("[.ch.]", ".", 0) == 0);
+}
+
+ATF_TC(fnmatch_equivclass);
+ATF_TC_HEAD(fnmatch_equivclass, tc)
+{
+	atf_tc_set_md_var(tc, "descr", "Test fnmatch with equivalence classes");
+}
+
+ATF_TC_BODY(fnmatch_equivclass, tc)
+{
+	setlocale(LC_ALL, "en_US.UTF-8");
+	ATF_CHECK(fnmatch("[[=a=]]b", "ab", 0) == 0);
+	ATF_CHECK(fnmatch("[[=a=]]b", "Ab", 0) == 0);
+	ATF_CHECK(fnmatch("[[=à=]]b", "ab", 0) == 0);
+	ATF_CHECK(fnmatch("[[=a=]]b", "àb", 0) == 0);
+
+	/*
+	 * POSIX.1, section 9.3.5, states that '[=' and '=]'
+	 * should be interpreted as an equivalence class only
+	 * when part of a bracket expression.
+	 */
+	ATF_CHECK(fnmatch("[=a=]b", "=b", 0) == 0);
+	ATF_CHECK(fnmatch("[=a=]b", "ab", 0) == 0);
+}
+
 ATF_TP_ADD_TCS(tp)
 {
 
 	ATF_TP_ADD_TC(tp, fnmatch_test);
+	ATF_TP_ADD_TC(tp, fnmatch_collsym);
+	ATF_TP_ADD_TC(tp, fnmatch_characterclass);
+	ATF_TP_ADD_TC(tp, fnmatch_equivclass);
 
 	return (atf_no_error());
 }