svn commit: r224802 - in user/gabor/tre-integration: contrib/tre/lib include
Gabor Kovesdan
gabor at FreeBSD.org
Fri Aug 12 16:17:15 UTC 2011
Author: gabor
Date: Fri Aug 12 16:17:15 2011
New Revision: 224802
URL: http://svn.freebsd.org/changeset/base/224802
Log:
- Introduce new flag for word-boundary matching: REG_WORD
- Partly recover broken word-boundary matching; rest is TODO
- Macroify fastcomp() and fastcomp_literal() initialization code
- Adjust a comment
Modified:
user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
user/gabor/tre-integration/contrib/tre/lib/tre.h
user/gabor/tre-integration/include/regex.h
Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Fri Aug 12 15:13:06 2011 (r224801)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Fri Aug 12 16:17:15 2011 (r224802)
@@ -332,6 +332,18 @@ static int fastcmp(const void *, const v
memcpy(p, pat, l * sizeof(tre_char_t)); \
p[l] = TRE_CHAR('\0');
+#define INIT_COMP \
+ /* Initialize. */ \
+ memset(fg, 0, sizeof(*fg)); \
+ fg->icase = (cflags & REG_ICASE); \
+ fg->word = (cflags & REG_WORD); \
+ \
+ /* Cannot handle REG_ICASE with MB string */ \
+ if (fg->icase && (MB_CUR_MAX > 1)) \
+ return REG_BADPAT; \
+ \
+ /* Calculate length if unspecified */ \
+ n = (n == 0) ? tre_strlen(pat) : n;
/*
* Returns: REG_OK on success, error code otherwise
@@ -340,12 +352,10 @@ int
tre_fastcomp_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
int cflags)
{
- /* Initialize. */
- memset(fg, 0, sizeof(*fg));
- fg->icase = (cflags & REG_ICASE);
+ INIT_COMP;
- /* Cannot handle REG_ICASE with MB string */
- if (fg->icase && (MB_CUR_MAX > 1))
+ /* Cannot handle word boundaries with MB string */
+ if (fg->word && (MB_CUR_MAX > 1))
return REG_BADPAT;
#ifdef TRE_WCHAR
@@ -372,15 +382,7 @@ int
tre_fastcomp(fastmatch_t *fg, const tre_char_t *pat, size_t n,
int cflags)
{
- /* Initialize. */
- memset(fg, 0, sizeof(*fg));
- fg->icase = (cflags & REG_ICASE);
-
- /* Cannot handle REG_ICASE with MB string */
- if (fg->icase && (MB_CUR_MAX > 1))
- return REG_BADPAT;
-
- n = (n == 0) ? tre_strlen(pat) : n;
+ INIT_COMP;
/* Remove end-of-line character ('$'). */
if ((n > 0) && (pat[n - 1] == TRE_CHAR('$')))
@@ -408,6 +410,10 @@ tre_fastcomp(fastmatch_t *fg, const tre_
fg->word = true;
}
+ /* Cannot handle word boundaries with MB string */
+ if (fg->word && (MB_CUR_MAX > 1))
+ return REG_BADPAT;
+
/* Look for ways to cheat...er...avoid the full regex engine. */
for (unsigned int i = 0; i < n; i++)
{
@@ -445,6 +451,34 @@ tre_fastcomp(fastmatch_t *fg, const tre_
return REG_OK;
}
+#define CHECK_WORD_BOUNDARY \
+ { \
+ bool bbound, ebound; \
+ \
+ switch (type) \
+ { \
+ case STR_WIDE: \
+ bbound = (j == 0) || !(tre_isalnum(str_wide[j - 1]) || \
+ (str_wide[j - 1] == TRE_CHAR('_'))); \
+ ebound = (j + fg->wlen == len) || \
+ !(tre_isalnum(str_wide[j + fg->wlen]) || \
+ (str_wide[j + fg->wlen] == TRE_CHAR('_'))); \
+ break; \
+ default: \
+ bbound = (j == 0) || !(tre_isalnum(str_byte[j - 1]) || \
+ (str_byte[j - 1] == '_')); \
+ ebound = (j + fg->len == len) || \
+ !(tre_isalnum(str_byte[j + fg->len]) || \
+ (str_byte[j + fg->len] == '_')); \
+ } \
+ if (!bbound || !ebound) \
+ { \
+ shift = 1; \
+ j += shift; \
+ continue; \
+ } \
+ }
+
/*
* Executes matching of the precompiled pattern on the input string.
* Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
@@ -485,6 +519,7 @@ tre_fastexec(const fastmatch_t *fg, cons
shift = fg->len;
}
+ /* XXX: Fix with word boundaries */
/* Only try once at the beginning or ending of the line. */
if (fg->bol || fg->eol)
{
@@ -506,7 +541,7 @@ tre_fastexec(const fastmatch_t *fg, cons
}
else
{
- /* Quick Search algorithm. */
+ /* Quick Search / Turbo Boyer-Moore algorithm. */
j = 0;
do
{
@@ -514,6 +549,8 @@ tre_fastexec(const fastmatch_t *fg, cons
COMPARE;
if (mismatch == REG_OK)
{
+ if (fg->word)
+ CHECK_WORD_BOUNDARY;
pmatch[0].rm_so = j;
pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
return REG_OK;
Modified: user/gabor/tre-integration/contrib/tre/lib/tre.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre.h Fri Aug 12 15:13:06 2011 (r224801)
+++ user/gabor/tre-integration/contrib/tre/lib/tre.h Fri Aug 12 16:17:15 2011 (r224802)
@@ -90,6 +90,7 @@ typedef enum {
#define REG_UNGREEDY (REG_RIGHT_ASSOC << 1)
#define REG_PEND (REG_UNGREEDY << 1)
#define REG_GNU (REG_PEND << 1)
+#define REG_WORD (REG_GNU << 1)
/* POSIX tre_regexec() flags. */
#define REG_NOTBOL 1
Modified: user/gabor/tre-integration/include/regex.h
==============================================================================
--- user/gabor/tre-integration/include/regex.h Fri Aug 12 15:13:06 2011 (r224801)
+++ user/gabor/tre-integration/include/regex.h Fri Aug 12 16:17:15 2011 (r224802)
@@ -82,6 +82,7 @@ typedef enum {
#define REG_UNGREEDY (REG_RIGHT_ASSOC << 1)
#define REG_PEND (REG_UNGREEDY << 1)
#define REG_GNU (REG_PEND << 1)
+#define REG_WORD (REG_GNU << 1)
/* POSIX tre_regexec() flags. */
#define REG_NOTBOL 1
More information about the svn-src-user mailing list