svn commit: r225051 - in user/gabor/tre-integration:
contrib/tre/lib include lib/libc/regex
Gabor Kovesdan
gabor at FreeBSD.org
Sun Aug 21 00:51:43 UTC 2011
Author: gabor
Date: Sun Aug 21 00:51:42 2011
New Revision: 225051
URL: http://svn.freebsd.org/changeset/base/225051
Log:
- Add the heuristic matching code. Currently
* it only supports BRE
* the heuristic construction is limited because the fast matching
algorithm is also limited and only allows simple heuristic patterns
* it has a bug and fills in incorrectly the matching offsets
- Even with the above limitations, some searches only require half of the
time with BSD grep.
Added:
user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c (contents, props changed)
user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h (contents, props changed)
Modified:
user/gabor/tre-integration/contrib/tre/lib/regcomp.c
user/gabor/tre-integration/contrib/tre/lib/regexec.c
user/gabor/tre-integration/contrib/tre/lib/tre-compile.c
user/gabor/tre-integration/include/regex.h
user/gabor/tre-integration/lib/libc/regex/Makefile.inc
Modified: user/gabor/tre-integration/contrib/tre/lib/regcomp.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/regcomp.c Sun Aug 21 00:48:03 2011 (r225050)
+++ user/gabor/tre-integration/contrib/tre/lib/regcomp.c Sun Aug 21 00:51:42 2011 (r225051)
@@ -16,6 +16,7 @@
#include <stdlib.h>
#include "tre-fastmatch.h"
+#include "tre-heuristic.h"
#include "tre-internal.h"
#include "xmalloc.h"
@@ -143,7 +144,15 @@ void
tre_regfree(regex_t *preg)
{
if (preg->shortcut != NULL)
- tre_free_fast(preg->shortcut);
+ {
+ tre_free_fast(preg->shortcut);
+ xfree(preg->shortcut);
+ }
+ if (preg->heur != NULL)
+ {
+ tre_free_heur(preg->heur);
+ xfree(preg->heur);
+ }
tre_free(preg);
}
Modified: user/gabor/tre-integration/contrib/tre/lib/regexec.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/regexec.c Sun Aug 21 00:48:03 2011 (r225050)
+++ user/gabor/tre-integration/contrib/tre/lib/regexec.c Sun Aug 21 00:51:42 2011 (r225051)
@@ -46,6 +46,7 @@ char *alloca ();
#include <limits.h>
#include "tre-fastmatch.h"
+#include "tre-heuristic.h"
#include "tre-internal.h"
#include "xmalloc.h"
@@ -151,14 +152,53 @@ tre_have_approx(const regex_t *preg)
static int
tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
- int eflags, fastmatch_t *shortcut)
+ int eflags, fastmatch_t *shortcut, heur_t *heur)
{
reg_errcode_t status;
int *tags = NULL, eo;
- /* Check if we can cheat with a faster algorithm */
+ /* Check if we can cheat with a faster algorithm. */
if (shortcut != NULL)
- return tre_match_fast(shortcut, string, len, type, nmatch, pmatch, eflags);
+ return tre_match_fast(shortcut, string, len, type, nmatch,
+ pmatch, eflags);
+
+ /* Check if we have a heuristic to speed up the search. */
+ if (heur != NULL)
+ {
+ int ret;
+ size_t st = 0, j;
+ const char *data_byte = string;
+ const tre_char_t *data_wide = string;
+ const void *tmp;
+
+ /* Look for the beginning of possibly matching text. */
+ ret = tre_match_fast(heur->start, string, len, type, nmatch,
+ pmatch, eflags);
+ if (ret != REG_OK)
+ return ret;
+ st = pmatch[0].rm_so;
+ j = pmatch[0].rm_eo;
+ string = (type == STR_WIDE) ? (void *)&data_wide[st] :
+ (void *)&data_byte[st];
+
+ /* When having a fixed-length pattern there is only one heuristic. */
+ if (heur->end == NULL)
+ return tre_match(tnfa, string,
+ heur->prefix ? (len - st) : (j - st),
+ type, nmatch, pmatch, eflags, NULL, NULL);
+
+ tmp = (type == STR_WIDE) ? (void *)&data_wide[j] :
+ (void *)&data_byte[j];
+
+ /* Look for the end of possibly matching text. */
+ ret = tre_match_fast(heur->end, tmp, len - j, type, nmatch,
+ pmatch, eflags);
+ if (ret != REG_OK)
+ return ret;
+
+ return tre_match(tnfa, string, pmatch[0].rm_eo + j - st,
+ type, nmatch, pmatch, eflags, NULL, NULL);
+ }
if (tnfa->num_tags > 0 && nmatch > 0)
{
@@ -227,7 +267,7 @@ tre_match(const tre_tnfa_t *tnfa, const
if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \
return REG_NOMATCH; \
ret = tre_match(tnfa, &str[offset], slen, type, nmatch, \
- pmatch, eflags, preg->shortcut); \
+ pmatch, eflags, preg->shortcut, preg->heur); \
for (unsigned i = 0; (i == 0) || (!(eflags & REG_NOSUB) && \
(i < nmatch)); i++) \
{ \
@@ -248,7 +288,7 @@ tre_regnexec(const regex_t *preg, const
ADJUST_OFFSETS
else
return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags,
- preg->shortcut);
+ preg->shortcut, preg->heur);
}
int
@@ -272,7 +312,7 @@ tre_regwnexec(const regex_t *preg, const
ADJUST_OFFSETS
else
return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags,
- preg->shortcut);
+ preg->shortcut, preg->heur);
}
int
@@ -290,7 +330,7 @@ tre_reguexec(const regex_t *preg, const
{
tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags,
- preg->shortcut);
+ preg->shortcut, preg->heur);
}
@@ -314,7 +354,7 @@ tre_match_approx(const tre_tnfa_t *tnfa,
if (params.max_cost == 0 && !tnfa->have_approx
&& !(eflags & REG_APPROX_MATCHER))
return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
- eflags, NULL);
+ eflags, NULL, NULL);
/* Back references are not supported by the approximate matcher. */
if (tnfa->have_backrefs)
Modified: user/gabor/tre-integration/contrib/tre/lib/tre-compile.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-compile.c Sun Aug 21 00:48:03 2011 (r225050)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-compile.c Sun Aug 21 00:51:42 2011 (r225051)
@@ -22,6 +22,7 @@
#include <string.h>
#include "tre-fastmatch.h"
+#include "tre-heuristic.h"
#include "tre-internal.h"
#include "tre-mem.h"
#include "tre-stack.h"
@@ -1867,6 +1868,7 @@ tre_compile(regex_t *preg, const tre_cha
reg_errcode_t errcode;
tre_mem_t mem;
fastmatch_t *shortcut;
+ heur_t *heur;
/* Parse context. */
tre_parse_ctx_t parse_ctx;
@@ -2179,6 +2181,28 @@ tre_compile(regex_t *preg, const tre_cha
xfree(offs);
preg->TRE_REGEX_T_FIELD = (void *)tnfa;
+
+ /*
+ * If we reach here, the regex is parsed and legal. Now we try to construct
+ * a heuristic to speed up matching.
+ */
+
+ heur = xmalloc(sizeof(heur_t));
+ if (!heur)
+ {
+ errcode = REG_ESPACE;
+ goto error_exit;
+ }
+
+ ret = tre_compile_heur(heur, regex, n, cflags);
+ if (ret != REG_OK)
+ {
+ xfree(heur);
+ preg->heur = NULL;
+ }
+ else
+ preg->heur = heur;
+
return REG_OK;
error_exit:
Added: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c Sun Aug 21 00:51:42 2011 (r225051)
@@ -0,0 +1,271 @@
+/*-
+ * Copyright (C) 2011 Gabor Kovesdan <gabor at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+#include <regex.h>
+#include <stdbool.h>
+#include <string.h>
+#ifdef TRE_WCHAR
+#include <wctype.h>
+#endif
+
+#include "tre-fastmatch.h"
+#include "tre-heuristic.h"
+#include "tre-internal.h"
+#include "xmalloc.h"
+
+/*
+ * Parses bracket expression seeking to the end of the enclosed text.
+ * The parameters are the opening (oe) and closing elements (ce).
+ * Can handle nested bracket expressions.
+ */
+#define PARSE_UNIT(oe, ce) \
+ { \
+ int level = 0; \
+ \
+ while (i < len) \
+ { \
+ if (regex[i] == TRE_CHAR(oe)) \
+ level++; \
+ else if (regex[i] == TRE_CHAR(ce)) \
+ level--; \
+ if (level == 0) \
+ break; \
+ i++; \
+ } \
+ }
+
+/*
+ * Finishes a segment (fixed-length text fragment).
+ */
+#define END_SEGMENT \
+ do \
+ { \
+ st = i + 1; \
+ escaped = false; \
+ goto end_segment; \
+ } while (0);
+
+/*
+ * Parses a regular expression and constructs a heuristic in heur_t and
+ * returns REG_OK if successful or the corresponding error code if a
+ * heuristic cannot be constructed.
+ */
+int
+tre_compile_heur(heur_t *h, const tre_char_t *regex, size_t len, int cflags)
+{
+ tre_char_t *heur;
+ int st = 0, pos = 0;
+ bool escaped = false;
+ int errcode, ret;
+
+ /* XXX: only basic regexes are supported. */
+ if (cflags & REG_EXTENDED)
+ return REG_BADPAT;
+
+ /* Temporary space, len will be enough. */
+ heur = xmalloc(len);
+ if (!heur)
+ return REG_ESPACE;
+
+ memset(h, 0, sizeof(*h));
+
+ while (true)
+ {
+
+ /*
+ * Process the pattern char-by-char.
+ *
+ * i: position in regex
+ * st: start offset of current segment (fixed-length fragment)
+ * to be processed
+ * pos: current position (and length) in the temporary space where
+ * we copy the current segment
+ */
+ for (int i = st; i < len; i++)
+ {
+ switch (regex[i])
+ {
+
+ /* Bracketed expression is substituted with a dot. */
+ case TRE_CHAR('['):
+ PARSE_UNIT('[', ']');
+ heur[pos++] = TRE_CHAR('.');
+ continue;
+
+ /*
+ * If a repetition marker, erases the repeting character
+ * and terminates the segment.
+ * Otherwise just terminates the segment (XXX).
+ */
+ case TRE_CHAR('{'):
+ PARSE_UNIT('{', '}');
+ if (escaped)
+ pos--;
+ END_SEGMENT;
+ break;
+
+ /*
+ * Terminates the current segment whether a subexpression
+ * marker or not. (XXX)
+ */
+ case TRE_CHAR('('):
+ PARSE_UNIT('(', ')');
+ END_SEGMENT;
+ break;
+
+ /*
+ * Sets escaped flag.
+ * Escaped escape terminates current segment. (XXX)
+ */
+ case TRE_CHAR('\\'):
+ if (escaped)
+ END_SEGMENT;
+ escaped = !escaped;
+ continue;
+
+ /*
+ * If not the first character, erases the last character
+ * and terminates the segment.
+ * Otherwise heuristic construction fails. (XXX)
+ */
+ case TRE_CHAR('*'):
+ if (i != 0)
+ {
+ pos--;
+ END_SEGMENT;
+ }
+ else
+ goto badpat1;
+ break;
+
+ /*
+ * If a backreference (escaped digit), terminates segment.
+ * Otherwise adds current character to the current segment
+ * by copying it to the temporary space.
+ */
+ default:
+ if (escaped && tre_isdigit(regex[i]))
+ END_SEGMENT;
+ heur[pos++] = regex[i];
+ continue;
+ }
+ }
+
+ /* We are done with the pattern if we got here. */
+ st = len;
+
+end_segment:
+
+ /* If it is not initialized yet, then we just got the first segment. */
+ if (h->start == NULL)
+ {
+
+ /*
+ * An empty or a one-char prefix segment is useless,
+ * better to just fail.
+ */
+ if (pos <= 1)
+ {
+ errcode = REG_BADPAT;
+ goto badpat1;
+ }
+
+ h->start = xmalloc(sizeof(fastmatch_t));
+ if (!h->start)
+ {
+ errcode = REG_ESPACE;
+ goto space1;
+ }
+
+ ret = tre_compile_fast(h->start, heur, pos, 0);
+ if (ret != REG_OK)
+ {
+ errcode = REG_BADPAT;
+ goto badpat2;
+ }
+ }
+
+ /*
+ * If true, this is the last segment. We do not care about the
+ * middle ones.
+ */
+ else if (st == len)
+ {
+
+ /* If empty, we only have a prefix heuristic. */
+ if (pos == 0)
+ {
+ h->prefix = true;
+ errcode = REG_OK;
+ goto ok;
+ }
+
+ h->end = xmalloc(sizeof(fastmatch_t));
+ if (!h->end)
+ {
+ errcode = REG_ESPACE;
+ goto space2;
+ }
+
+ ret = tre_compile_fast(h->end, heur, pos, 0);
+ if (ret != REG_OK)
+ {
+ xfree(h->end);
+ h->prefix = true;
+ }
+ errcode = REG_OK;
+ goto ok;
+ }
+
+ /* Just drop middle segments by overwriting the temporary space. */
+ pos = 0;
+ }
+
+badpat2:
+space2:
+ if (h->start != NULL)
+ xfree(h->start);
+badpat1:
+space1:
+ok:
+ xfree(heur);
+ return errcode;
+}
+
+/*
+ * Frees a heuristic.
+ */
+void
+tre_free_heur(heur_t *h)
+{
+ if (h->start != NULL)
+ xfree(h->start);
+ if (h->end != NULL)
+ xfree(h->end);
+}
Added: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h Sun Aug 21 00:51:42 2011 (r225051)
@@ -0,0 +1,21 @@
+#ifndef TRE_HEURISTIC_H
+#define TRE_HEURISTIC_H 1
+
+#include <fastmatch.h>
+#include <stdbool.h>
+
+#include "tre-fastmatch.h"
+#include "tre-internal.h"
+
+typedef struct {
+ fastmatch_t *start;
+ fastmatch_t *end;
+ bool prefix;
+} heur_t;
+
+
+extern int tre_compile_heur(heur_t *h, const tre_char_t *regex,
+ size_t len, int cflags);
+extern void tre_free_heur(heur_t *h);
+
+#endif /* TRE_HEURISTIC_H */
Modified: user/gabor/tre-integration/include/regex.h
==============================================================================
--- user/gabor/tre-integration/include/regex.h Sun Aug 21 00:48:03 2011 (r225050)
+++ user/gabor/tre-integration/include/regex.h Sun Aug 21 00:51:42 2011 (r225051)
@@ -67,6 +67,7 @@ typedef struct {
size_t re_nsub; /* Number of parenthesized subexpressions. */
void *value; /* For internal use only. */
void *shortcut; /* For internal use only. */
+ void *heur; /* For internal use only. */
const char *re_endp;
} regex_t;
Modified: user/gabor/tre-integration/lib/libc/regex/Makefile.inc
==============================================================================
--- user/gabor/tre-integration/lib/libc/regex/Makefile.inc Sun Aug 21 00:48:03 2011 (r225050)
+++ user/gabor/tre-integration/lib/libc/regex/Makefile.inc Sun Aug 21 00:51:42 2011 (r225051)
@@ -6,7 +6,7 @@
CFLAGS+=-DHAVE_CONFIG_H -DTRE_LIBC_BUILD -I${.CURDIR}/../../contrib/tre
SRCS+= fastmatch.c hashtable.c regcomp.c regerror.c regexec.c tre-ast.c \
- tre-compile.c tre-fastmatch.c tre-match-approx.c \
+ tre-compile.c tre-fastmatch.c tre-heuristic.c tre-match-approx.c \
tre-match-backtrack.c tre-match-parallel.c tre-mem.c tre-parse.c \
tre-stack.c xmalloc.c
More information about the svn-src-user
mailing list