summaryrefslogtreecommitdiffstats
path: root/gnu/lib
diff options
context:
space:
mode:
authortjr <tjr@FreeBSD.org>2004-07-04 01:03:02 +0000
committertjr <tjr@FreeBSD.org>2004-07-04 01:03:02 +0000
commit5f9e4022f9e75041ea54067801e73f18ffab63ba (patch)
tree38d6f0c6303600623102f00fe0513f7022e84cdf /gnu/lib
parent15d9c0f83966da644ce0286d651be247cfdd61ae (diff)
downloadFreeBSD-src-5f9e4022f9e75041ea54067801e73f18ffab63ba.zip
FreeBSD-src-5f9e4022f9e75041ea54067801e73f18ffab63ba.tar.gz
Import of GNU regex from GNU grep 2.5.1 (trimmed).
Diffstat (limited to 'gnu/lib')
-rw-r--r--gnu/lib/libregex/posix/regex.h48
-rw-r--r--gnu/lib/libregex/regex.c2847
2 files changed, 2461 insertions, 434 deletions
diff --git a/gnu/lib/libregex/posix/regex.h b/gnu/lib/libregex/posix/regex.h
index d88ab92..63c2fef 100644
--- a/gnu/lib/libregex/posix/regex.h
+++ b/gnu/lib/libregex/posix/regex.h
@@ -1,6 +1,6 @@
/* Definitions for data structures and routines for the regular
expression library, version 0.12.
- Copyright (C) 1985,89,90,91,92,93,95,96,97,98 Free Software Foundation, Inc.
+ Copyright (C) 1985,1989-1993,1995-1998, 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library. Its master source is NOT part of
the C library, however. The master source lives in /gd/gnu/lib.
@@ -160,6 +160,11 @@ typedef unsigned long int reg_syntax_t;
this bit set, and it won't affect anything in the normal case. */
#define RE_DEBUG (RE_NO_GNU_OPS << 1)
+/* If this bit is set, a syntactically invalid interval is treated as
+ a string of ordinary characters. For example, the ERE 'a{1' is
+ treated as 'a\{1'. */
+#define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)
+
/* This global variable defines the particular regexp syntax to use (for
some interfaces). When a regexp is compiled, the syntax used is
stored in the pattern buffer, so changing this does not affect
@@ -199,7 +204,8 @@ extern reg_syntax_t re_syntax_options;
| RE_NO_BK_VBAR)
#define RE_SYNTAX_POSIX_EGREP \
- (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
+ (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES \
+ | RE_INVALID_INTERVAL_ORD)
/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC
@@ -221,13 +227,13 @@ extern reg_syntax_t re_syntax_options;
(_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
#define RE_SYNTAX_POSIX_EXTENDED \
- (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
- | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \
- | RE_NO_BK_PARENS | RE_NO_BK_VBAR \
- | RE_UNMATCHED_RIGHT_PAREN_ORD)
+ (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
+ | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \
+ | RE_NO_BK_PARENS | RE_NO_BK_VBAR \
+ | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD)
-/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
- replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
+/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is
+ removed and RE_NO_BK_REFS is added. */
#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \
(_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
| RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \
@@ -513,13 +519,31 @@ extern int re_exec _RE_ARGS ((const char *));
# endif
#endif
+/* GCC 2.95 and later have "__restrict"; C99 compilers have
+ "restrict", and "configure" may have defined "restrict". */
+#ifndef __restrict
+# if ! (2 < __GNUC__ || (2 == __GNUC__ && 95 <= __GNUC_MINOR__))
+# if defined restrict || 199901L <= __STDC_VERSION__
+# define __restrict restrict
+# else
+# define __restrict
+# endif
+# endif
+#endif
+/* For now unconditionally define __restrict_arr to expand to nothing.
+ Ideally we would have a test for the compiler which allows defining
+ it to restrict. */
+#define __restrict_arr
+
/* POSIX compatibility. */
-extern int regcomp _RE_ARGS ((regex_t *__preg, const char *__pattern,
+extern int regcomp _RE_ARGS ((regex_t *__restrict __preg,
+ const char *__restrict __pattern,
int __cflags));
-extern int regexec _RE_ARGS ((const regex_t *__preg,
- const char *__string, size_t __nmatch,
- regmatch_t __pmatch[], int __eflags));
+extern int regexec _RE_ARGS ((const regex_t *__restrict __preg,
+ const char *__restrict __string, size_t __nmatch,
+ regmatch_t __pmatch[__restrict_arr],
+ int __eflags));
extern size_t regerror _RE_ARGS ((int __errcode, const regex_t *__preg,
char *__errbuf, size_t __errbuf_size));
diff --git a/gnu/lib/libregex/regex.c b/gnu/lib/libregex/regex.c
index 640e3fe..358071b 100644
--- a/gnu/lib/libregex/regex.c
+++ b/gnu/lib/libregex/regex.c
@@ -2,7 +2,7 @@
version 0.12.
(Implements POSIX draft P1003.2/D11.2, except for some of the
internationalization features.)
- Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
+ Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@@ -56,6 +56,30 @@
# include <wctype.h>
#endif
+/* This is for multi byte string support. */
+#ifdef MBS_SUPPORT
+# define CHAR_TYPE wchar_t
+# define US_CHAR_TYPE wchar_t/* unsigned character type */
+# define COMPILED_BUFFER_VAR wc_buffer
+# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
+# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1)
+# define PUT_CHAR(c) \
+ do { \
+ if (MB_CUR_MAX == 1) \
+ putchar (c); \
+ else \
+ printf ("%C", (wint_t) c); /* Should we use wide stream?? */ \
+ } while (0)
+# define TRUE 1
+# define FALSE 0
+#else
+# define CHAR_TYPE char
+# define US_CHAR_TYPE unsigned char /* unsigned character type */
+# define COMPILED_BUFFER_VAR bufp->buffer
+# define OFFSET_ADDRESS_SIZE 2
+# define PUT_CHAR(c) putchar (c)
+#endif /* MBS_SUPPORT */
+
#ifdef _LIBC
/* We have to keep the namespace clean. */
# define regfree(preg) __regfree (preg)
@@ -78,19 +102,30 @@
__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
-#define btowc __btowc
-#endif
+# define btowc __btowc
+/* We are also using some library internals. */
+# include <locale/localeinfo.h>
+# include <locale/elem-hash.h>
+# include <langinfo.h>
+# include <locale/coll-lookup.h>
+#endif
-#ifndef _
-/* This is for other GNU distributions with internationalized messages.
- When compiling libc, the _ and N_ macros are predefined. */
-# ifdef HAVE_LIBINTL_H
-# include <libintl.h>
-# else
-# define gettext(msgid) (msgid)
+/* This is for other GNU distributions with internationalized messages. */
+#if HAVE_LIBINTL_H || defined _LIBC
+# include <libintl.h>
+# ifdef _LIBC
+# undef gettext
+# define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
# endif
-# define N_(msgid) (msgid)
+#else
+# define gettext(msgid) (msgid)
+#endif
+
+#ifndef gettext_noop
+/* This define is so xgettext can find the internationalizable
+ strings. */
+# define gettext_noop(String) String
#endif
/* The `emacs' switch turns on certain matching commands
@@ -164,6 +199,14 @@ char *realloc ();
# endif
#endif /* not emacs */
+
+#if defined _LIBC || HAVE_LIMITS_H
+# include <limits.h>
+#endif
+
+#ifndef MB_LEN_MAX
+# define MB_LEN_MAX 1
+#endif
/* Get the interface, including the syntax bits. */
#include <regex.h>
@@ -247,6 +290,8 @@ extern char *re_syntax_table;
static char re_syntax_table[CHAR_SET_SIZE];
+static void init_syntax_once PARAMS ((void));
+
static void
init_syntax_once ()
{
@@ -268,7 +313,7 @@ init_syntax_once ()
# endif /* not SYNTAX_TABLE */
-# define SYNTAX(c) re_syntax_table[((c) & 0xFF)]
+# define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
#endif /* emacs */
@@ -396,6 +441,11 @@ typedef enum
/* Followed by one byte giving n, then by n literal bytes. */
exactn,
+#ifdef MBS_SUPPORT
+ /* Same as exactn, but contains binary data. */
+ exactn_bin,
+#endif
+
/* Matches any (more or less) character. */
anychar,
@@ -405,6 +455,13 @@ typedef enum
are ordered low-bit-first. A character is in the set if its
bit is 1. A character too large to have a bit in the map is
automatically not in the set. */
+ /* ifdef MBS_SUPPORT, following element is length of character
+ classes, length of collating symbols, length of equivalence
+ classes, length of character ranges, and length of characters.
+ Next, character class element, collating symbols elements,
+ equivalence class elements, range elements, and character
+ elements follow.
+ See regex_compile function. */
charset,
/* Same parameters as charset, but match any character that is
@@ -454,6 +511,7 @@ typedef enum
/* Followed by two-byte relative address of place to resume at
in case of failure. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
on_failure_jump,
/* Like on_failure_jump, but pushes a placeholder instead of the
@@ -462,6 +520,7 @@ typedef enum
/* Throw away latest failure point and then jump to following
two-byte relative address. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
pop_failure_jump,
/* Change to pop_failure_jump if know won't have to backtrack to
@@ -471,6 +530,7 @@ typedef enum
sure that there is no use backtracking out of repetitions
already matched, then we change it to a pop_failure_jump.
Followed by two-byte address. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
maybe_pop_jump,
/* Jump to following two-byte address, and push a dummy failure
@@ -478,6 +538,7 @@ typedef enum
is made to use it for a failure. A `+' construct makes this
before the first repeat. Also used as an intermediary kind
of jump when compiling an alternative. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
dummy_failure_jump,
/* Push a dummy failure point and continue. Used at the end of
@@ -486,15 +547,18 @@ typedef enum
/* Followed by two-byte relative address and two-byte number n.
After matching N times, jump to the address upon failure. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
succeed_n,
/* Followed by two-byte relative address, and two-byte number n.
Jump to the address N times, then fail. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
jump_n,
/* Set the following two-byte relative address to the
subsequent two-byte number. The address *includes* the two
bytes of number. */
+ /* ifdef MBS_SUPPORT, the size of address is 1. */
set_number_at,
wordchar, /* Matches any word-constituent character. */
@@ -523,42 +587,63 @@ typedef enum
/* Common operations on the compiled pattern. */
/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
+/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
-#define STORE_NUMBER(destination, number) \
+#ifdef MBS_SUPPORT
+# define STORE_NUMBER(destination, number) \
+ do { \
+ *(destination) = (US_CHAR_TYPE)(number); \
+ } while (0)
+#else
+# define STORE_NUMBER(destination, number) \
do { \
(destination)[0] = (number) & 0377; \
(destination)[1] = (number) >> 8; \
} while (0)
+#endif /* MBS_SUPPORT */
/* Same as STORE_NUMBER, except increment DESTINATION to
the byte after where the number is stored. Therefore, DESTINATION
must be an lvalue. */
+/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
#define STORE_NUMBER_AND_INCR(destination, number) \
do { \
STORE_NUMBER (destination, number); \
- (destination) += 2; \
+ (destination) += OFFSET_ADDRESS_SIZE; \
} while (0)
/* Put into DESTINATION a number stored in two contiguous bytes starting
at SOURCE. */
+/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
-#define EXTRACT_NUMBER(destination, source) \
+#ifdef MBS_SUPPORT
+# define EXTRACT_NUMBER(destination, source) \
+ do { \
+ (destination) = *(source); \
+ } while (0)
+#else
+# define EXTRACT_NUMBER(destination, source) \
do { \
(destination) = *(source) & 0377; \
(destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
} while (0)
+#endif
#ifdef DEBUG
-static void extract_number _RE_ARGS ((int *dest, unsigned char *source));
+static void extract_number _RE_ARGS ((int *dest, US_CHAR_TYPE *source));
static void
extract_number (dest, source)
int *dest;
- unsigned char *source;
+ US_CHAR_TYPE *source;
{
+#ifdef MBS_SUPPORT
+ *dest = *source;
+#else
int temp = SIGN_EXTEND_CHAR (*(source + 1));
*dest = *source & 0377;
*dest += temp << 8;
+#endif
}
# ifndef EXTRACT_MACROS /* To debug the macros. */
@@ -574,19 +659,19 @@ extract_number (dest, source)
#define EXTRACT_NUMBER_AND_INCR(destination, source) \
do { \
EXTRACT_NUMBER (destination, source); \
- (source) += 2; \
+ (source) += OFFSET_ADDRESS_SIZE; \
} while (0)
#ifdef DEBUG
static void extract_number_and_incr _RE_ARGS ((int *destination,
- unsigned char **source));
+ US_CHAR_TYPE **source));
static void
extract_number_and_incr (destination, source)
int *destination;
- unsigned char **source;
+ US_CHAR_TYPE **source;
{
extract_number (destination, *source);
- *source += 2;
+ *source += OFFSET_ADDRESS_SIZE;
}
# ifndef EXTRACT_MACROS
@@ -660,13 +745,13 @@ print_fastmap (fastmap)
void
print_partial_compiled_pattern (start, end)
- unsigned char *start;
- unsigned char *end;
+ US_CHAR_TYPE *start;
+ US_CHAR_TYPE *end;
{
int mcnt, mcnt2;
- unsigned char *p1;
- unsigned char *p = start;
- unsigned char *pend = end;
+ US_CHAR_TYPE *p1;
+ US_CHAR_TYPE *p = start;
+ US_CHAR_TYPE *pend = end;
if (start == NULL)
{
@@ -677,7 +762,11 @@ print_partial_compiled_pattern (start, end)
/* Loop over pattern commands. */
while (p < pend)
{
- printf ("%d:\t", p - start);
+#ifdef _LIBC
+ printf ("%td:\t", p - start);
+#else
+ printf ("%ld:\t", (long int) (p - start));
+#endif
switch ((re_opcode_t) *p++)
{
@@ -691,23 +780,35 @@ print_partial_compiled_pattern (start, end)
do
{
putchar ('/');
- putchar (*p++);
+ PUT_CHAR (*p++);
+ }
+ while (--mcnt);
+ break;
+
+#ifdef MBS_SUPPORT
+ case exactn_bin:
+ mcnt = *p++;
+ printf ("/exactn_bin/%d", mcnt);
+ do
+ {
+ printf("/%lx", (long int) *p++);
}
while (--mcnt);
break;
+#endif /* MBS_SUPPORT */
case start_memory:
mcnt = *p++;
- printf ("/start_memory/%d/%d", mcnt, *p++);
+ printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
break;
case stop_memory:
mcnt = *p++;
- printf ("/stop_memory/%d/%d", mcnt, *p++);
+ printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
break;
case duplicate:
- printf ("/duplicate/%d", *p++);
+ printf ("/duplicate/%ld", (long int) *p++);
break;
case anychar:
@@ -717,6 +818,51 @@ print_partial_compiled_pattern (start, end)
case charset:
case charset_not:
{
+#ifdef MBS_SUPPORT
+ int i, length;
+ wchar_t *workp = p;
+ printf ("/charset [%s",
+ (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
+ p += 5;
+ length = *workp++; /* the length of char_classes */
+ for (i=0 ; i<length ; i++)
+ printf("[:%lx:]", (long int) *p++);
+ length = *workp++; /* the length of collating_symbol */
+ for (i=0 ; i<length ;)
+ {
+ printf("[.");
+ while(*p != 0)
+ PUT_CHAR((i++,*p++));
+ i++,p++;
+ printf(".]");
+ }
+ length = *workp++; /* the length of equivalence_class */
+ for (i=0 ; i<length ;)
+ {
+ printf("[=");
+ while(*p != 0)
+ PUT_CHAR((i++,*p++));
+ i++,p++;
+ printf("=]");
+ }
+ length = *workp++; /* the length of char_range */
+ for (i=0 ; i<length ; i++)
+ {
+ wchar_t range_start = *p++;
+ wchar_t range_end = *p++;
+ if (MB_CUR_MAX == 1)
+ printf("%c-%c", (char) range_start, (char) range_end);
+ else
+ printf("%C-%C", (wint_t) range_start, (wint_t) range_end);
+ }
+ length = *workp++; /* the length of char */
+ for (i=0 ; i<length ; i++)
+ if (MB_CUR_MAX == 1)
+ putchar (*p++);
+ else
+ printf("%C", (wint_t) *p++);
+ putchar (']');
+#else
register int c, last = -100;
register int in_range = 0;
@@ -754,6 +900,7 @@ print_partial_compiled_pattern (start, end)
putchar (']');
p += 1 + *p;
+#endif /* MBS_SUPPORT */
}
break;
@@ -767,17 +914,30 @@ print_partial_compiled_pattern (start, end)
case on_failure_jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_jump to %d", p + mcnt - start);
+#ifdef _LIBC
+ printf ("/on_failure_jump to %td", p + mcnt - start);
+#else
+ printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
+#endif
break;
case on_failure_keep_string_jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
+#ifdef _LIBC
+ printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
+#else
+ printf ("/on_failure_keep_string_jump to %ld",
+ (long int) (p + mcnt - start));
+#endif
break;
case dummy_failure_jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/dummy_failure_jump to %d", p + mcnt - start);
+#ifdef _LIBC
+ printf ("/dummy_failure_jump to %td", p + mcnt - start);
+#else
+ printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
+#endif
break;
case push_dummy_failure:
@@ -786,29 +946,50 @@ print_partial_compiled_pattern (start, end)
case maybe_pop_jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/maybe_pop_jump to %d", p + mcnt - start);
+#ifdef _LIBC
+ printf ("/maybe_pop_jump to %td", p + mcnt - start);
+#else
+ printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
+#endif
break;
case pop_failure_jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/pop_failure_jump to %d", p + mcnt - start);
+#ifdef _LIBC
+ printf ("/pop_failure_jump to %td", p + mcnt - start);
+#else
+ printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
+#endif
break;
case jump_past_alt:
extract_number_and_incr (&mcnt, &p);
- printf ("/jump_past_alt to %d", p + mcnt - start);
+#ifdef _LIBC
+ printf ("/jump_past_alt to %td", p + mcnt - start);
+#else
+ printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
+#endif
break;
case jump:
extract_number_and_incr (&mcnt, &p);
- printf ("/jump to %d", p + mcnt - start);
+#ifdef _LIBC
+ printf ("/jump to %td", p + mcnt - start);
+#else
+ printf ("/jump to %ld", (long int) (p + mcnt - start));
+#endif
break;
case succeed_n:
extract_number_and_incr (&mcnt, &p);
p1 = p + mcnt;
extract_number_and_incr (&mcnt2, &p);
- printf ("/succeed_n to %d, %d times", p1 - start, mcnt2);
+#ifdef _LIBC
+ printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
+#else
+ printf ("/succeed_n to %ld, %d times",
+ (long int) (p1 - start), mcnt2);
+#endif
break;
case jump_n:
@@ -822,7 +1003,12 @@ print_partial_compiled_pattern (start, end)
extract_number_and_incr (&mcnt, &p);
p1 = p + mcnt;
extract_number_and_incr (&mcnt2, &p);
- printf ("/set_number_at location %d to %d", p1 - start, mcnt2);
+#ifdef _LIBC
+ printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
+#else
+ printf ("/set_number_at location %ld to %d",
+ (long int) (p1 - start), mcnt2);
+#endif
break;
case wordbound:
@@ -839,6 +1025,7 @@ print_partial_compiled_pattern (start, end)
case wordend:
printf ("/wordend");
+ break;
# ifdef emacs
case before_dot:
@@ -883,13 +1070,17 @@ print_partial_compiled_pattern (start, end)
break;
default:
- printf ("?%d", *(p-1));
+ printf ("?%ld", (long int) *(p-1));
}
putchar ('\n');
}
- printf ("%d:\tend of pattern.\n", p - start);
+#ifdef _LIBC
+ printf ("%td:\tend of pattern.\n", p - start);
+#else
+ printf ("%ld:\tend of pattern.\n", (long int) (p - start));
+#endif
}
@@ -897,9 +1088,10 @@ void
print_compiled_pattern (bufp)
struct re_pattern_buffer *bufp;
{
- unsigned char *buffer = bufp->buffer;
+ US_CHAR_TYPE *buffer = (US_CHAR_TYPE*) bufp->buffer;
- print_partial_compiled_pattern (buffer, buffer + bufp->used);
+ print_partial_compiled_pattern (buffer, buffer
+ + bufp->used / sizeof(US_CHAR_TYPE));
printf ("%ld bytes used/%ld bytes allocated.\n",
bufp->used, bufp->allocated);
@@ -909,7 +1101,11 @@ print_compiled_pattern (bufp)
print_fastmap (bufp->fastmap);
}
- printf ("re_nsub: %d\t", bufp->re_nsub);
+#ifdef _LIBC
+ printf ("re_nsub: %Zd\t", bufp->re_nsub);
+#else
+ printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
+#endif
printf ("regs_alloc: %d\t", bufp->regs_allocated);
printf ("can_be_null: %d\t", bufp->can_be_null);
printf ("newline_anchor: %d\n", bufp->newline_anchor);
@@ -923,9 +1119,9 @@ print_compiled_pattern (bufp)
void
print_double_string (where, string1, size1, string2, size2)
- const char *where;
- const char *string1;
- const char *string2;
+ const CHAR_TYPE *where;
+ const CHAR_TYPE *string1;
+ const CHAR_TYPE *string2;
int size1;
int size2;
{
@@ -938,13 +1134,13 @@ print_double_string (where, string1, size1, string2, size2)
if (FIRST_STRING_P (where))
{
for (this_char = where - string1; this_char < size1; this_char++)
- putchar (string1[this_char]);
+ PUT_CHAR (string1[this_char]);
where = string2;
}
for (this_char = where - string2; this_char < size2; this_char++)
- putchar (string2[this_char]);
+ PUT_CHAR (string2[this_char]);
}
}
@@ -970,6 +1166,92 @@ printchar (c)
#endif /* not DEBUG */
+#ifdef MBS_SUPPORT
+/* This convert a multibyte string to a wide character string.
+ And write their correspondances to offset_buffer(see below)
+ and write whether each wchar_t is binary data to is_binary.
+ This assume invalid multibyte sequences as binary data.
+ We assume offset_buffer and is_binary is already allocated
+ enough space. */
+
+static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src,
+ size_t len, int *offset_buffer,
+ char *is_binary);
+static size_t
+convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
+ CHAR_TYPE *dest;
+ const unsigned char* src;
+ size_t len; /* the length of multibyte string. */
+
+ /* It hold correspondances between src(char string) and
+ dest(wchar_t string) for optimization.
+ e.g. src = "xxxyzz"
+ dest = {'X', 'Y', 'Z'}
+ (each "xxx", "y" and "zz" represent one multibyte character
+ corresponding to 'X', 'Y' and 'Z'.)
+ offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
+ = {0, 3, 4, 6}
+ */
+ int *offset_buffer;
+ char *is_binary;
+{
+ wchar_t *pdest = dest;
+ const unsigned char *psrc = src;
+ size_t wc_count = 0;
+
+ if (MB_CUR_MAX == 1)
+ { /* We don't need conversion. */
+ for ( ; wc_count < len ; ++wc_count)
+ {
+ *pdest++ = *psrc++;
+ is_binary[wc_count] = FALSE;
+ offset_buffer[wc_count] = wc_count;
+ }
+ offset_buffer[wc_count] = wc_count;
+ }
+ else
+ {
+ /* We need conversion. */
+ mbstate_t mbs;
+ int consumed;
+ size_t mb_remain = len;
+ size_t mb_count = 0;
+
+ /* Initialize the conversion state. */
+ memset (&mbs, 0, sizeof (mbstate_t));
+
+ offset_buffer[0] = 0;
+ for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
+ psrc += consumed)
+ {
+ consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
+
+ if (consumed <= 0)
+ /* failed to convert. maybe src contains binary data.
+ So we consume 1 byte manualy. */
+ {
+ *pdest = *psrc;
+ consumed = 1;
+ is_binary[wc_count] = TRUE;
+ }
+ else
+ is_binary[wc_count] = FALSE;
+ /* In sjis encoding, we use yen sign as escape character in
+ place of reverse solidus. So we convert 0x5c(yen sign in
+ sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
+ solidus in UCS2). */
+ if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
+ *pdest = (wchar_t) *psrc;
+
+ offset_buffer[wc_count + 1] = mb_count += consumed;
+ }
+ }
+
+ return wc_count;
+}
+
+#endif /* MBS_SUPPORT */
+
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
also be assigned to arbitrarily: each pattern buffer stores its own
syntax, so it can be changed between regex compilations. */
@@ -1009,68 +1291,59 @@ weak_alias (__re_set_syntax, re_set_syntax)
POSIX doesn't require that we do anything for REG_NOERROR,
but why not be nice? */
-#if 0
- /* This section is for xgettext; it sees the strings wrapped inside
- N_() and marks them as needing translation. They should match
- the strings in re_error_msgid. We can't use the usual string
- concatenation trick to initialize re_error_msgid, since other GNU
- distributions use this file with traditional C, and traditional C
- lacks string concatenation. */
- N_("Success") /* REG_NOERROR */
- N_("No match") /* REG_NOMATCH */
- N_("Invalid regular expression") /* REG_BADPAT */
- N_("Invalid collation character") /* REG_ECOLLATE */
- N_("Invalid character class name") /* REG_ECTYPE */
- N_("Trailing backslash") /* REG_EESCAPE */
- N_("Invalid back reference") /* REG_ESUBREG */
- N_("Unmatched [ or [^") /* REG_EBRACK */
- N_("Unmatched ( or \\(") /* REG_EPAREN */
- N_("Unmatched \\{") /* REG_EBRACE */
- N_("Invalid content of \\{\\}") /* REG_BADBR */
- N_("Invalid range end") /* REG_ERANGE */
- N_("Memory exhausted") /* REG_ESPACE */
- N_("Invalid preceding regular expression") /* REG_BADRPT */
- N_("Premature end of regular expression") /* REG_EEND */
- N_("Regular expression too big") /* REG_ESIZE */
- N_("Unmatched ) or \\)") /* REG_ERPAREN */
-#endif
-
-static const char re_error_msgid[] = "\
-Success\0\
-No match\0\
-Invalid regular expression\0\
-Invalid collation character\0\
-Invalid character class name\0\
-Trailing backslash\0\
-Invalid back reference\0\
-Unmatched [ or [^\0\
-Unmatched ( or \\(\0\
-Unmatched \\{\0\
-Invalid content of \\{\\}\0\
-Invalid range end\0\
-Memory exhausted\0\
-Invalid preceding regular expression\0\
-Premature end of regular expression\0\
-Regular expression too big\0\
-Unmatched ) or \\)";
-
+static const char re_error_msgid[] =
+ {
#define REG_NOERROR_IDX 0
+ gettext_noop ("Success") /* REG_NOERROR */
+ "\0"
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
+ gettext_noop ("No match") /* REG_NOMATCH */
+ "\0"
#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
+ gettext_noop ("Invalid regular expression") /* REG_BADPAT */
+ "\0"
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
+ gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
+ "\0"
#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
+ gettext_noop ("Invalid character class name") /* REG_ECTYPE */
+ "\0"
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
+ gettext_noop ("Trailing backslash") /* REG_EESCAPE */
+ "\0"
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
+ gettext_noop ("Invalid back reference") /* REG_ESUBREG */
+ "\0"
#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
+ gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
+ "\0"
#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
+ gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
+ "\0"
#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
+ gettext_noop ("Unmatched \\{") /* REG_EBRACE */
+ "\0"
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
+ gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
+ "\0"
#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
+ gettext_noop ("Invalid range end") /* REG_ERANGE */
+ "\0"
#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
+ gettext_noop ("Memory exhausted") /* REG_ESPACE */
+ "\0"
#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
+ gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
+ "\0"
#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
+ gettext_noop ("Premature end of regular expression") /* REG_EEND */
+ "\0"
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
+ gettext_noop ("Regular expression too big") /* REG_ESIZE */
+ "\0"
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
+ gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
+ };
static const size_t re_error_msgid_idx[] =
{
@@ -1160,7 +1433,7 @@ long int re_max_failures = 2000;
union fail_stack_elt
{
- unsigned char *pointer;
+ US_CHAR_TYPE *pointer;
long int integer;
};
@@ -1178,14 +1451,14 @@ typedef struct
# if defined MATCH_MAY_ALLOCATE
/* 4400 was enough to cause a crash on Alpha OSF/1,
whose default stack limit is 2mb. */
-int re_max_failures = 20000;
+int re_max_failures = 4000;
# else
int re_max_failures = 2000;
# endif
union fail_stack_elt
{
- unsigned char *pointer;
+ US_CHAR_TYPE *pointer;
int integer;
};
@@ -1267,7 +1540,7 @@ typedef struct
Assumes the variable `fail_stack'. Probably should only
be called from within `PUSH_FAILURE_POINT'. */
#define PUSH_FAILURE_POINTER(item) \
- fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
+ fail_stack.stack[fail_stack.avail++].pointer = (US_CHAR_TYPE *) (item)
/* This pushes an integer-valued item onto the failure stack.
Assumes the variable `fail_stack'. Probably should only
@@ -1424,12 +1697,11 @@ typedef struct
Also assumes the variables `fail_stack' and (if debugging), `bufp',
`pend', `string1', `size1', `string2', and `size2'. */
-
#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
{ \
DEBUG_STATEMENT (unsigned failure_id;) \
active_reg_t this_reg; \
- const unsigned char *string_temp; \
+ const US_CHAR_TYPE *string_temp; \
\
assert (!FAIL_STACK_EMPTY ()); \
\
@@ -1448,13 +1720,13 @@ typedef struct
saved NULL, thus retaining our current position in the string. */ \
string_temp = POP_FAILURE_POINTER (); \
if (string_temp != NULL) \
- str = (const char *) string_temp; \
+ str = (const CHAR_TYPE *) string_temp; \
\
DEBUG_PRINT2 (" Popping string %p: `", str); \
DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
DEBUG_PRINT1 ("'\n"); \
\
- pat = (unsigned char *) POP_FAILURE_POINTER (); \
+ pat = (US_CHAR_TYPE *) POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
\
@@ -1474,10 +1746,10 @@ typedef struct
DEBUG_PRINT2 (" info: %p\n", \
reg_info[this_reg].word.pointer); \
\
- regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
+ regend[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
\
- regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
+ regstart[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER ();\
DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
} \
else \
@@ -1495,7 +1767,6 @@ typedef struct
DEBUG_STATEMENT (nfailure_points_popped++); \
} /* POP_FAILURE_POINT */
-
/* Structure for per-register (a.k.a. per-group) information.
Other register information, such as the
@@ -1553,7 +1824,7 @@ typedef union
while (0)
/* Registers are set to a sentinel when they haven't yet matched. */
-static char reg_unset_dummy;
+static CHAR_TYPE reg_unset_dummy;
#define REG_UNSET_VALUE (&reg_unset_dummy)
#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
@@ -1562,40 +1833,65 @@ static char reg_unset_dummy;
static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
reg_syntax_t syntax,
struct re_pattern_buffer *bufp));
-static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
-static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
+static void store_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, int arg));
+static void store_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
int arg1, int arg2));
-static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
- int arg, unsigned char *end));
-static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
- int arg1, int arg2, unsigned char *end));
-static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
+static void insert_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
+ int arg, US_CHAR_TYPE *end));
+static void insert_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
+ int arg1, int arg2, US_CHAR_TYPE *end));
+static boolean at_begline_loc_p _RE_ARGS ((const CHAR_TYPE *pattern,
+ const CHAR_TYPE *p,
reg_syntax_t syntax));
-static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
+static boolean at_endline_loc_p _RE_ARGS ((const CHAR_TYPE *p,
+ const CHAR_TYPE *pend,
reg_syntax_t syntax));
-static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
- const char *pend,
+#ifdef MBS_SUPPORT
+static reg_errcode_t compile_range _RE_ARGS ((CHAR_TYPE range_start,
+ const CHAR_TYPE **p_ptr,
+ const CHAR_TYPE *pend,
+ char *translate,
+ reg_syntax_t syntax,
+ US_CHAR_TYPE *b,
+ CHAR_TYPE *char_set));
+static void insert_space _RE_ARGS ((int num, CHAR_TYPE *loc, CHAR_TYPE *end));
+#else
+static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
+ const CHAR_TYPE **p_ptr,
+ const CHAR_TYPE *pend,
char *translate,
reg_syntax_t syntax,
- unsigned char *b));
+ US_CHAR_TYPE *b));
+#endif /* MBS_SUPPORT */
/* Fetch the next character in the uncompiled pattern---translating it
if necessary. Also cast from a signed character in the constant
string passed to us by the user to an unsigned char that we can use
as an array index (in, e.g., `translate'). */
+/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
+ because it is impossible to allocate 4GB array for some encodings
+ which have 4 byte character_set like UCS4. */
#ifndef PATFETCH
-# define PATFETCH(c) \
+# ifdef MBS_SUPPORT
+# define PATFETCH(c) \
+ do {if (p == pend) return REG_EEND; \
+ c = (US_CHAR_TYPE) *p++; \
+ if (translate && (c <= 0xff)) c = (US_CHAR_TYPE) translate[c]; \
+ } while (0)
+# else
+# define PATFETCH(c) \
do {if (p == pend) return REG_EEND; \
c = (unsigned char) *p++; \
if (translate) c = (unsigned char) translate[c]; \
} while (0)
+# endif /* MBS_SUPPORT */
#endif
/* Fetch the next character in the uncompiled pattern, with no
translation. */
#define PATFETCH_RAW(c) \
do {if (p == pend) return REG_EEND; \
- c = (unsigned char) *p++; \
+ c = (US_CHAR_TYPE) *p++; \
} while (0)
/* Go backwards one character in the pattern. */
@@ -1606,27 +1902,43 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
cast the subscript to translate because some data is declared as
`char *', to avoid warnings when a string constant is passed. But
when we use a character as a subscript we must make it unsigned. */
+/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
+ because it is impossible to allocate 4GB array for some encodings
+ which have 4 byte character_set like UCS4. */
#ifndef TRANSLATE
-# define TRANSLATE(d) \
+# ifdef MBS_SUPPORT
+# define TRANSLATE(d) \
+ ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \
+ ? (char) translate[(unsigned char) (d)] : (d))
+#else
+# define TRANSLATE(d) \
(translate ? (char) translate[(unsigned char) (d)] : (d))
+# endif /* MBS_SUPPORT */
#endif
/* Macros for outputting the compiled pattern into `buffer'. */
/* If the buffer isn't allocated when it comes in, use this. */
-#define INIT_BUF_SIZE 32
+#define INIT_BUF_SIZE (32 * sizeof(US_CHAR_TYPE))
/* Make sure we have at least N more bytes of space in buffer. */
-#define GET_BUFFER_SPACE(n) \
+#ifdef MBS_SUPPORT
+# define GET_BUFFER_SPACE(n) \
+ while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
+ + (n)*sizeof(CHAR_TYPE)) > bufp->allocated) \
+ EXTEND_BUFFER ()
+#else
+# define GET_BUFFER_SPACE(n) \
while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
EXTEND_BUFFER ()
+#endif /* MBS_SUPPORT */
/* Make sure we have one more byte of buffer space and then add C to it. */
#define BUF_PUSH(c) \
do { \
GET_BUFFER_SPACE (1); \
- *b++ = (unsigned char) (c); \
+ *b++ = (US_CHAR_TYPE) (c); \
} while (0)
@@ -1634,8 +1946,8 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
#define BUF_PUSH_2(c1, c2) \
do { \
GET_BUFFER_SPACE (2); \
- *b++ = (unsigned char) (c1); \
- *b++ = (unsigned char) (c2); \
+ *b++ = (US_CHAR_TYPE) (c1); \
+ *b++ = (US_CHAR_TYPE) (c2); \
} while (0)
@@ -1643,28 +1955,28 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
#define BUF_PUSH_3(c1, c2, c3) \
do { \
GET_BUFFER_SPACE (3); \
- *b++ = (unsigned char) (c1); \
- *b++ = (unsigned char) (c2); \
- *b++ = (unsigned char) (c3); \
+ *b++ = (US_CHAR_TYPE) (c1); \
+ *b++ = (US_CHAR_TYPE) (c2); \
+ *b++ = (US_CHAR_TYPE) (c3); \
} while (0)
-
/* Store a jump with opcode OP at LOC to location TO. We store a
relative address offset by the three bytes the jump itself occupies. */
#define STORE_JUMP(op, loc, to) \
- store_op1 (op, loc, (int) ((to) - (loc) - 3))
+ store_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))
/* Likewise, for a two-argument jump. */
#define STORE_JUMP2(op, loc, to, arg) \
- store_op2 (op, loc, (int) ((to) - (loc) - 3), arg)
+ store_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)
/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
#define INSERT_JUMP(op, loc, to) \
- insert_op1 (op, loc, (int) ((to) - (loc) - 3), b)
+ insert_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)
/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
#define INSERT_JUMP2(op, loc, to, arg) \
- insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b)
+ insert_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\
+ arg, b)
/* This is not an arbitrary limit: the arguments which represent offsets
@@ -1690,31 +2002,90 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
reset the pointers that pointed into the old block to point to the
correct places in the new one. If extending the buffer results in it
being larger than MAX_BUF_SIZE, then flag memory exhausted. */
-#define EXTEND_BUFFER() \
- do { \
- unsigned char *old_buffer = bufp->buffer; \
- if (bufp->allocated == MAX_BUF_SIZE) \
+#if __BOUNDED_POINTERS__
+# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
+# define MOVE_BUFFER_POINTER(P) \
+ (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
+# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
+ else \
+ { \
+ SET_HIGH_BOUND (b); \
+ SET_HIGH_BOUND (begalt); \
+ if (fixup_alt_jump) \
+ SET_HIGH_BOUND (fixup_alt_jump); \
+ if (laststart) \
+ SET_HIGH_BOUND (laststart); \
+ if (pending_exact) \
+ SET_HIGH_BOUND (pending_exact); \
+ }
+#else
+# define MOVE_BUFFER_POINTER(P) (P) += incr
+# define ELSE_EXTEND_BUFFER_HIGH_BOUND
+#endif
+
+#ifdef MBS_SUPPORT
+# define EXTEND_BUFFER() \
+ do { \
+ US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
+ int wchar_count; \
+ if (bufp->allocated + sizeof(US_CHAR_TYPE) > MAX_BUF_SIZE) \
return REG_ESIZE; \
bufp->allocated <<= 1; \
if (bufp->allocated > MAX_BUF_SIZE) \
- bufp->allocated = MAX_BUF_SIZE; \
- bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\
- if (bufp->buffer == NULL) \
+ bufp->allocated = MAX_BUF_SIZE; \
+ /* How many characters the new buffer can have? */ \
+ wchar_count = bufp->allocated / sizeof(US_CHAR_TYPE); \
+ if (wchar_count == 0) wchar_count = 1; \
+ /* Truncate the buffer to CHAR_TYPE align. */ \
+ bufp->allocated = wchar_count * sizeof(US_CHAR_TYPE); \
+ RETALLOC (COMPILED_BUFFER_VAR, wchar_count, US_CHAR_TYPE); \
+ bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
+ if (COMPILED_BUFFER_VAR == NULL) \
return REG_ESPACE; \
/* If the buffer moved, move all the pointers into it. */ \
- if (old_buffer != bufp->buffer) \
+ if (old_buffer != COMPILED_BUFFER_VAR) \
{ \
- b = (b - old_buffer) + bufp->buffer; \
- begalt = (begalt - old_buffer) + bufp->buffer; \
- if (fixup_alt_jump) \
- fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
- if (laststart) \
- laststart = (laststart - old_buffer) + bufp->buffer; \
- if (pending_exact) \
- pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
+ int incr = COMPILED_BUFFER_VAR - old_buffer; \
+ MOVE_BUFFER_POINTER (b); \
+ MOVE_BUFFER_POINTER (begalt); \
+ if (fixup_alt_jump) \
+ MOVE_BUFFER_POINTER (fixup_alt_jump); \
+ if (laststart) \
+ MOVE_BUFFER_POINTER (laststart); \
+ if (pending_exact) \
+ MOVE_BUFFER_POINTER (pending_exact); \
} \
+ ELSE_EXTEND_BUFFER_HIGH_BOUND \
} while (0)
-
+#else
+# define EXTEND_BUFFER() \
+ do { \
+ US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
+ if (bufp->allocated == MAX_BUF_SIZE) \
+ return REG_ESIZE; \
+ bufp->allocated <<= 1; \
+ if (bufp->allocated > MAX_BUF_SIZE) \
+ bufp->allocated = MAX_BUF_SIZE; \
+ bufp->buffer = (US_CHAR_TYPE *) REALLOC (COMPILED_BUFFER_VAR, \
+ bufp->allocated); \
+ if (COMPILED_BUFFER_VAR == NULL) \
+ return REG_ESPACE; \
+ /* If the buffer moved, move all the pointers into it. */ \
+ if (old_buffer != COMPILED_BUFFER_VAR) \
+ { \
+ int incr = COMPILED_BUFFER_VAR - old_buffer; \
+ MOVE_BUFFER_POINTER (b); \
+ MOVE_BUFFER_POINTER (begalt); \
+ if (fixup_alt_jump) \
+ MOVE_BUFFER_POINTER (fixup_alt_jump); \
+ if (laststart) \
+ MOVE_BUFFER_POINTER (laststart); \
+ if (pending_exact) \
+ MOVE_BUFFER_POINTER (pending_exact); \
+ } \
+ ELSE_EXTEND_BUFFER_HIGH_BOUND \
+ } while (0)
+#endif /* MBS_SUPPORT */
/* Since we have one byte reserved for the register number argument to
{start,stop}_memory, the maximum number of groups we can report
@@ -1768,20 +2139,20 @@ typedef struct
/* Get the next unsigned number in the uncompiled pattern. */
#define GET_UNSIGNED_NUMBER(num) \
- { if (p != pend) \
- { \
- PATFETCH (c); \
- while ('0' <= c && c <= '9') \
- { \
- if (num < 0) \
- num = 0; \
- num = num * 10 + c - '0'; \
- if (p == pend) \
- break; \
- PATFETCH (c); \
- } \
- } \
- }
+ { \
+ while (p != pend) \
+ { \
+ PATFETCH (c); \
+ if (! ('0' <= c && c <= '9')) \
+ break; \
+ if (num <= RE_DUP_MAX) \
+ { \
+ if (num < 0) \
+ num = 0; \
+ num = num * 10 + c - '0'; \
+ } \
+ } \
+ }
#if defined _LIBC || WIDE_CHAR_SUPPORT
/* The GNU C library provides support for user-defined character classes
@@ -1882,33 +2253,61 @@ static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
examined nor set. */
/* Return, freeing storage we allocated. */
-#define FREE_STACK_RETURN(value) \
+#ifdef MBS_SUPPORT
+# define FREE_STACK_RETURN(value) \
+ return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
+#else
+# define FREE_STACK_RETURN(value) \
return (free (compile_stack.stack), value)
+#endif /* MBS_SUPPORT */
static reg_errcode_t
+#ifdef MBS_SUPPORT
+regex_compile (cpattern, csize, syntax, bufp)
+ const char *cpattern;
+ size_t csize;
+#else
regex_compile (pattern, size, syntax, bufp)
const char *pattern;
size_t size;
+#endif /* MBS_SUPPORT */
reg_syntax_t syntax;
struct re_pattern_buffer *bufp;
{
/* We fetch characters from PATTERN here. Even though PATTERN is
`char *' (i.e., signed), we declare these variables as unsigned, so
they can be reliably used as array indices. */
- register unsigned char c, c1;
+ register US_CHAR_TYPE c, c1;
+
+#ifdef MBS_SUPPORT
+ /* A temporary space to keep wchar_t pattern and compiled pattern. */
+ CHAR_TYPE *pattern, *COMPILED_BUFFER_VAR;
+ size_t size;
+ /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
+ int *mbs_offset = NULL;
+ /* It hold whether each wchar_t is binary data or not. */
+ char *is_binary = NULL;
+ /* A flag whether exactn is handling binary data or not. */
+ char is_exactn_bin = FALSE;
+#endif /* MBS_SUPPORT */
/* A random temporary spot in PATTERN. */
- const char *p1;
+ const CHAR_TYPE *p1;
/* Points to the end of the buffer, where we should append. */
- register unsigned char *b;
+ register US_CHAR_TYPE *b;
/* Keeps track of unclosed groups. */
compile_stack_type compile_stack;
/* Points to the current (ending) position in the pattern. */
- const char *p = pattern;
- const char *pend = pattern + size;
+#ifdef MBS_SUPPORT
+ const CHAR_TYPE *p;
+ const CHAR_TYPE *pend;
+#else
+ const CHAR_TYPE *p = pattern;
+ const CHAR_TYPE *pend = pattern + size;
+#endif /* MBS_SUPPORT */
/* How to translate the characters in the pattern. */
RE_TRANSLATE_TYPE translate = bufp->translate;
@@ -1917,30 +2316,50 @@ regex_compile (pattern, size, syntax, bufp)
command. This makes it possible to tell if a new exact-match
character can be added to that command or if the character requires
a new `exactn' command. */
- unsigned char *pending_exact = 0;
+ US_CHAR_TYPE *pending_exact = 0;
/* Address of start of the most recently finished expression.
This tells, e.g., postfix * where to find the start of its
operand. Reset at the beginning of groups and alternatives. */
- unsigned char *laststart = 0;
+ US_CHAR_TYPE *laststart = 0;
/* Address of beginning of regexp, or inside of last group. */
- unsigned char *begalt;
-
- /* Place in the uncompiled pattern (i.e., the {) to
- which to go back if the interval is invalid. */
- const char *beg_interval;
+ US_CHAR_TYPE *begalt;
/* Address of the place where a forward jump should go to the end of
the containing expression. Each alternative of an `or' -- except the
last -- ends with a forward jump of this sort. */
- unsigned char *fixup_alt_jump = 0;
+ US_CHAR_TYPE *fixup_alt_jump = 0;
/* Counts open-groups as they are encountered. Remembered for the
matching close-group on the compile stack, so the same register
number is put in the stop_memory as the start_memory. */
regnum_t regnum = 0;
+#ifdef MBS_SUPPORT
+ /* Initialize the wchar_t PATTERN and offset_buffer. */
+ p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE);
+ p[csize] = L'\0'; /* sentinel */
+ mbs_offset = TALLOC(csize + 1, int);
+ is_binary = TALLOC(csize + 1, char);
+ if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
+ {
+ if (pattern) free(pattern);
+ if (mbs_offset) free(mbs_offset);
+ if (is_binary) free(is_binary);
+ return REG_ESPACE;
+ }
+ size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
+ pend = p + size;
+ if (size < 0)
+ {
+ if (pattern) free(pattern);
+ if (mbs_offset) free(mbs_offset);
+ if (is_binary) free(is_binary);
+ return REG_BADPAT;
+ }
+#endif
+
#ifdef DEBUG
DEBUG_PRINT1 ("\nCompiling pattern: ");
if (debug)
@@ -1948,7 +2367,7 @@ regex_compile (pattern, size, syntax, bufp)
unsigned debug_count;
for (debug_count = 0; debug_count < size; debug_count++)
- putchar (pattern[debug_count]);
+ PUT_CHAR (pattern[debug_count]);
putchar ('\n');
}
#endif /* DEBUG */
@@ -1956,7 +2375,14 @@ regex_compile (pattern, size, syntax, bufp)
/* Initialize the compile stack. */
compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
if (compile_stack.stack == NULL)
- return REG_ESPACE;
+ {
+#ifdef MBS_SUPPORT
+ if (pattern) free(pattern);
+ if (mbs_offset) free(mbs_offset);
+ if (is_binary) free(is_binary);
+#endif
+ return REG_ESPACE;
+ }
compile_stack.size = INIT_COMPILE_STACK_SIZE;
compile_stack.avail = 0;
@@ -1985,18 +2411,34 @@ regex_compile (pattern, size, syntax, bufp)
{ /* If zero allocated, but buffer is non-null, try to realloc
enough space. This loses if buffer's address is bogus, but
that is the user's responsibility. */
- RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
+#ifdef MBS_SUPPORT
+ /* Free bufp->buffer and allocate an array for wchar_t pattern
+ buffer. */
+ free(bufp->buffer);
+ COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(US_CHAR_TYPE),
+ US_CHAR_TYPE);
+#else
+ RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, US_CHAR_TYPE);
+#endif /* MBS_SUPPORT */
}
else
{ /* Caller did not allocate a buffer. Do it for them. */
- bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
+ COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(US_CHAR_TYPE),
+ US_CHAR_TYPE);
}
- if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
+ if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
+#ifdef MBS_SUPPORT
+ bufp->buffer = (char*)COMPILED_BUFFER_VAR;
+#endif /* MBS_SUPPORT */
bufp->allocated = INIT_BUF_SIZE;
}
+#ifdef MBS_SUPPORT
+ else
+ COMPILED_BUFFER_VAR = (US_CHAR_TYPE*) bufp->buffer;
+#endif
- begalt = b = bufp->buffer;
+ begalt = b = COMPILED_BUFFER_VAR;
/* Loop through the uncompiled pattern until we're at the end. */
while (p != pend)
@@ -2121,7 +2563,7 @@ regex_compile (pattern, size, syntax, bufp)
assert (p - 1 > pattern);
/* Allocate the space for the jump. */
- GET_BUFFER_SPACE (3);
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
/* We know we are not at the first character of the pattern,
because laststart was nonzero. And we've already
@@ -2138,20 +2580,23 @@ regex_compile (pattern, size, syntax, bufp)
}
else
/* Anything else. */
- STORE_JUMP (maybe_pop_jump, b, laststart - 3);
+ STORE_JUMP (maybe_pop_jump, b, laststart -
+ (1 + OFFSET_ADDRESS_SIZE));
/* We've added more stuff to the buffer. */
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
/* On failure, jump from laststart to b + 3, which will be the
end of the buffer after this jump is inserted. */
- GET_BUFFER_SPACE (3);
+ /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
+ 'b + 3'. */
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
: on_failure_jump,
- laststart, b + 3);
+ laststart, b + 1 + OFFSET_ADDRESS_SIZE);
pending_exact = 0;
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
if (!zero_times_ok)
{
@@ -2160,9 +2605,10 @@ regex_compile (pattern, size, syntax, bufp)
`on_failure_jump' instruction of the loop. This
effects a skip over that instruction the first time
we hit that loop. */
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (dummy_failure_jump, laststart, laststart +
+ 2 + 2 * OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
}
break;
@@ -2177,9 +2623,476 @@ regex_compile (pattern, size, syntax, bufp)
case '[':
{
boolean had_char_class = false;
-
+#ifdef MBS_SUPPORT
+ CHAR_TYPE range_start = 0xffffffff;
+#else
+ unsigned int range_start = 0xffffffff;
+#endif
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+#ifdef MBS_SUPPORT
+ /* We assume a charset(_not) structure as a wchar_t array.
+ charset[0] = (re_opcode_t) charset(_not)
+ charset[1] = l (= length of char_classes)
+ charset[2] = m (= length of collating_symbols)
+ charset[3] = n (= length of equivalence_classes)
+ charset[4] = o (= length of char_ranges)
+ charset[5] = p (= length of chars)
+
+ charset[6] = char_class (wctype_t)
+ charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
+ ...
+ charset[l+5] = char_class (wctype_t)
+
+ charset[l+6] = collating_symbol (wchar_t)
+ ...
+ charset[l+m+5] = collating_symbol (wchar_t)
+ ifdef _LIBC we use the index if
+ _NL_COLLATE_SYMB_EXTRAMB instead of
+ wchar_t string.
+
+ charset[l+m+6] = equivalence_classes (wchar_t)
+ ...
+ charset[l+m+n+5] = equivalence_classes (wchar_t)
+ ifdef _LIBC we use the index in
+ _NL_COLLATE_WEIGHT instead of
+ wchar_t string.
+
+ charset[l+m+n+6] = range_start
+ charset[l+m+n+7] = range_end
+ ...
+ charset[l+m+n+2o+4] = range_start
+ charset[l+m+n+2o+5] = range_end
+ ifdef _LIBC we use the value looked up
+ in _NL_COLLATE_COLLSEQ instead of
+ wchar_t character.
+
+ charset[l+m+n+2o+6] = char
+ ...
+ charset[l+m+n+2o+p+5] = char
+
+ */
+
+ /* We need at least 6 spaces: the opcode, the length of
+ char_classes, the length of collating_symbols, the length of
+ equivalence_classes, the length of char_ranges, the length of
+ chars. */
+ GET_BUFFER_SPACE (6);
+
+ /* Save b as laststart. And We use laststart as the pointer
+ to the first element of the charset here.
+ In other words, laststart[i] indicates charset[i]. */
+ laststart = b;
+
+ /* We test `*p == '^' twice, instead of using an if
+ statement, so we only need one BUF_PUSH. */
+ BUF_PUSH (*p == '^' ? charset_not : charset);
+ if (*p == '^')
+ p++;
+
+ /* Push the length of char_classes, the length of
+ collating_symbols, the length of equivalence_classes, the
+ length of char_ranges and the length of chars. */
+ BUF_PUSH_3 (0, 0, 0);
+ BUF_PUSH_2 (0, 0);
+
+ /* Remember the first position in the bracket expression. */
+ p1 = p;
+
+ /* charset_not matches newline according to a syntax bit. */
+ if ((re_opcode_t) b[-6] == charset_not
+ && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
+ {
+ BUF_PUSH('\n');
+ laststart[5]++; /* Update the length of characters */
+ }
+
+ /* Read in characters and ranges, setting map bits. */
+ for (;;)
+ {
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ PATFETCH (c);
+
+ /* \ might escape characters inside [...] and [^...]. */
+ if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
+ {
+ if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
+
+ PATFETCH (c1);
+ BUF_PUSH(c1);
+ laststart[5]++; /* Update the length of chars */
+ range_start = c1;
+ continue;
+ }
+
+ /* Could be the end of the bracket expression. If it's
+ not (i.e., when the bracket expression is `[]' so
+ far), the ']' character bit gets set way below. */
+ if (c == ']' && p != p1 + 1)
+ break;
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character class. */
+ if (had_char_class && c == '-' && *p != ']')
+ FREE_STACK_RETURN (REG_ERANGE);
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character: if this is a hyphen not at the
+ beginning or the end of a list, then it's the range
+ operator. */
+ if (c == '-'
+ && !(p - 2 >= pattern && p[-2] == '[')
+ && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
+ && *p != ']')
+ {
+ reg_errcode_t ret;
+ /* Allocate the space for range_start and range_end. */
+ GET_BUFFER_SPACE (2);
+ /* Update the pointer to indicate end of buffer. */
+ b += 2;
+ ret = compile_range (range_start, &p, pend, translate,
+ syntax, b, laststart);
+ if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
+ }
+ else if (p[0] == '-' && p[1] != ']')
+ { /* This handles ranges made up of characters only. */
+ reg_errcode_t ret;
+
+ /* Move past the `-'. */
+ PATFETCH (c1);
+ /* Allocate the space for range_start and range_end. */
+ GET_BUFFER_SPACE (2);
+ /* Update the pointer to indicate end of buffer. */
+ b += 2;
+ ret = compile_range (c, &p, pend, translate, syntax, b,
+ laststart);
+ if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
+ }
+
+ /* See if we're at the beginning of a possible character
+ class. */
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
+ { /* Leave room for the null. */
+ char str[CHAR_CLASS_MAX_LENGTH + 1];
+
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[:'. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == ':' && *p == ']') || p == pend)
+ break;
+ if (c1 < CHAR_CLASS_MAX_LENGTH)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ /* If isn't a word bracketed by `[:' and `:]':
+ undo the ending character, the letters, and leave
+ the leading `:' and `[' (but store them as character). */
+ if (c == ':' && *p == ']')
+ {
+ wctype_t wt;
+ uintptr_t alignedp;
+
+ /* Query the character class as wctype_t. */
+ wt = IS_CHAR_CLASS (str);
+ if (wt == 0)
+ FREE_STACK_RETURN (REG_ECTYPE);
+
+ /* Throw away the ] at the end of the character
+ class. */
+ PATFETCH (c);
+
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ /* Allocate the space for character class. */
+ GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
+ /* Update the pointer to indicate end of buffer. */
+ b += CHAR_CLASS_SIZE;
+ /* Move data which follow character classes
+ not to violate the data. */
+ insert_space(CHAR_CLASS_SIZE,
+ laststart + 6 + laststart[1],
+ b - 1);
+ alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
+ + __alignof__(wctype_t) - 1)
+ & ~(uintptr_t)(__alignof__(wctype_t) - 1);
+ /* Store the character class. */
+ *((wctype_t*)alignedp) = wt;
+ /* Update length of char_classes */
+ laststart[1] += CHAR_CLASS_SIZE;
+
+ had_char_class = true;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ BUF_PUSH ('[');
+ BUF_PUSH (':');
+ laststart[5] += 2; /* Update the length of characters */
+ range_start = ':';
+ had_char_class = false;
+ }
+ }
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
+ || *p == '.'))
+ {
+ CHAR_TYPE str[128]; /* Should be large enough. */
+ CHAR_TYPE delim = *p; /* '=' or '.' */
+# ifdef _LIBC
+ uint32_t nrules =
+ _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+# endif
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[=' or '[[.'. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == delim && *p == ']') || p == pend)
+ break;
+ if (c1 < sizeof (str) - 1)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ if (c == delim && *p == ']' && str[0] != '\0')
+ {
+ unsigned int i, offset;
+ /* If we have no collation data we use the default
+ collation in which each character is in a class
+ by itself. It also means that ASCII is the
+ character set and therefore we cannot have character
+ with more than one byte in the multibyte
+ representation. */
+
+ /* If not defined _LIBC, we push the name and
+ `\0' for the sake of matching performance. */
+ int datasize = c1 + 1;
+
+# ifdef _LIBC
+ int32_t idx = 0;
+ if (nrules == 0)
+# endif
+ {
+ if (c1 != 1)
+ FREE_STACK_RETURN (REG_ECOLLATE);
+ }
+# ifdef _LIBC
+ else
+ {
+ const int32_t *table;
+ const int32_t *weights;
+ const int32_t *extra;
+ const int32_t *indirect;
+ wint_t *cp;
+
+ /* This #include defines a local function! */
+# include <locale/weightwc.h>
+
+ if(delim == '=')
+ {
+ /* We push the index for equivalence class. */
+ cp = (wint_t*)str;
+
+ table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_TABLEWC);
+ weights = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_WEIGHTWC);
+ extra = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_EXTRAWC);
+ indirect = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_INDIRECTWC);
+
+ idx = findidx ((const wint_t**)&cp);
+ if (idx == 0 || cp < (wint_t*) str + c1)
+ /* This is no valid character. */
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ str[0] = (wchar_t)idx;
+ }
+ else /* delim == '.' */
+ {
+ /* We push collation sequence value
+ for collating symbol. */
+ int32_t table_size;
+ const int32_t *symb_table;
+ const unsigned char *extra;
+ int32_t idx;
+ int32_t elem;
+ int32_t second;
+ int32_t hash;
+ char char_str[c1];
+
+ /* We have to convert the name to a single-byte
+ string. This is possible since the names
+ consist of ASCII characters and the internal
+ representation is UCS4. */
+ for (i = 0; i < c1; ++i)
+ char_str[i] = str[i];
+
+ table_size =
+ _NL_CURRENT_WORD (LC_COLLATE,
+ _NL_COLLATE_SYMB_HASH_SIZEMB);
+ symb_table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_SYMB_TABLEMB);
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_SYMB_EXTRAMB);
+
+ /* Locate the character in the hashing table. */
+ hash = elem_hash (char_str, c1);
+
+ idx = 0;
+ elem = hash % table_size;
+ second = hash % (table_size - 2);
+ while (symb_table[2 * elem] != 0)
+ {
+ /* First compare the hashing value. */
+ if (symb_table[2 * elem] == hash
+ && c1 == extra[symb_table[2 * elem + 1]]
+ && memcmp (str,
+ &extra[symb_table[2 * elem + 1]
+ + 1], c1) == 0)
+ {
+ /* Yep, this is the entry. */
+ idx = symb_table[2 * elem + 1];
+ idx += 1 + extra[idx];
+ break;
+ }
+
+ /* Next entry. */
+ elem += second;
+ }
+
+ if (symb_table[2 * elem] != 0)
+ {
+ /* Compute the index of the byte sequence
+ in the table. */
+ idx += 1 + extra[idx];
+ /* Adjust for the alignment. */
+ idx = (idx + 3) & ~4;
+
+ str[0] = (wchar_t) idx + 4;
+ }
+ else if (symb_table[2 * elem] == 0 && c1 == 1)
+ {
+ /* No valid character. Match it as a
+ single byte character. */
+ had_char_class = false;
+ BUF_PUSH(str[0]);
+ /* Update the length of characters */
+ laststart[5]++;
+ range_start = str[0];
+
+ /* Throw away the ] at the end of the
+ collating symbol. */
+ PATFETCH (c);
+ /* exit from the switch block. */
+ continue;
+ }
+ else
+ FREE_STACK_RETURN (REG_ECOLLATE);
+ }
+ datasize = 1;
+ }
+# endif
+ /* Throw away the ] at the end of the equivalence
+ class (or collating symbol). */
+ PATFETCH (c);
+
+ /* Allocate the space for the equivalence class
+ (or collating symbol) (and '\0' if needed). */
+ GET_BUFFER_SPACE(datasize);
+ /* Update the pointer to indicate end of buffer. */
+ b += datasize;
+
+ if (delim == '=')
+ { /* equivalence class */
+ /* Calculate the offset of char_ranges,
+ which is next to equivalence_classes. */
+ offset = laststart[1] + laststart[2]
+ + laststart[3] +6;
+ /* Insert space. */
+ insert_space(datasize, laststart + offset, b - 1);
+
+ /* Write the equivalence_class and \0. */
+ for (i = 0 ; i < datasize ; i++)
+ laststart[offset + i] = str[i];
+
+ /* Update the length of equivalence_classes. */
+ laststart[3] += datasize;
+ had_char_class = true;
+ }
+ else /* delim == '.' */
+ { /* collating symbol */
+ /* Calculate the offset of the equivalence_classes,
+ which is next to collating_symbols. */
+ offset = laststart[1] + laststart[2] + 6;
+ /* Insert space and write the collationg_symbol
+ and \0. */
+ insert_space(datasize, laststart + offset, b-1);
+ for (i = 0 ; i < datasize ; i++)
+ laststart[offset + i] = str[i];
+
+ /* In re_match_2_internal if range_start < -1, we
+ assume -range_start is the offset of the
+ collating symbol which is specified as
+ the character of the range start. So we assign
+ -(laststart[1] + laststart[2] + 6) to
+ range_start. */
+ range_start = -(laststart[1] + laststart[2] + 6);
+ /* Update the length of collating_symbol. */
+ laststart[2] += datasize;
+ had_char_class = false;
+ }
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ BUF_PUSH ('[');
+ BUF_PUSH (delim);
+ laststart[5] += 2; /* Update the length of characters */
+ range_start = delim;
+ had_char_class = false;
+ }
+ }
+ else
+ {
+ had_char_class = false;
+ BUF_PUSH(c);
+ laststart[5]++; /* Update the length of characters */
+ range_start = c;
+ }
+ }
+
+#else /* not MBS_SUPPORT */
/* Ensure that we have enough space to push a charset: the
opcode, the length count, and the bitset; 34 bytes in all. */
GET_BUFFER_SPACE (34);
@@ -2220,6 +3133,7 @@ regex_compile (pattern, size, syntax, bufp)
PATFETCH (c1);
SET_LIST_BIT (c1);
+ range_start = c1;
continue;
}
@@ -2244,8 +3158,10 @@ regex_compile (pattern, size, syntax, bufp)
&& *p != ']')
{
reg_errcode_t ret
- = compile_range (&p, pend, translate, syntax, b);
+ = compile_range (range_start, &p, pend, translate,
+ syntax, b);
if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
}
else if (p[0] == '-' && p[1] != ']')
@@ -2255,8 +3171,9 @@ regex_compile (pattern, size, syntax, bufp)
/* Move past the `-'. */
PATFETCH (c1);
- ret = compile_range (&p, pend, translate, syntax, b);
+ ret = compile_range (c, &p, pend, translate, syntax, b);
if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
}
/* See if we're at the beginning of a possible character
@@ -2290,7 +3207,7 @@ regex_compile (pattern, size, syntax, bufp)
the leading `:' and `[' (but set bits for them). */
if (c == ':' && *p == ']')
{
-#if defined _LIBC || WIDE_CHAR_SUPPORT
+# if defined _LIBC || WIDE_CHAR_SUPPORT
boolean is_lower = STREQ (str, "lower");
boolean is_upper = STREQ (str, "upper");
wctype_t wt;
@@ -2308,13 +3225,13 @@ regex_compile (pattern, size, syntax, bufp)
for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
{
-# ifdef _LIBC
+# ifdef _LIBC
if (__iswctype (__btowc (ch), wt))
SET_LIST_BIT (ch);
-# else
+# else
if (iswctype (btowc (ch), wt))
SET_LIST_BIT (ch);
-# endif
+# endif
if (translate && (is_upper || is_lower)
&& (ISUPPER (ch) || ISLOWER (ch)))
@@ -2322,7 +3239,7 @@ regex_compile (pattern, size, syntax, bufp)
}
had_char_class = true;
-#else
+# else
int ch;
boolean is_alnum = STREQ (str, "alnum");
boolean is_alpha = STREQ (str, "alpha");
@@ -2370,7 +3287,7 @@ regex_compile (pattern, size, syntax, bufp)
SET_LIST_BIT (ch);
}
had_char_class = true;
-#endif /* libc || wctype.h */
+# endif /* libc || wctype.h */
}
else
{
@@ -2379,13 +3296,292 @@ regex_compile (pattern, size, syntax, bufp)
PATUNFETCH;
SET_LIST_BIT ('[');
SET_LIST_BIT (':');
+ range_start = ':';
had_char_class = false;
}
}
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
+ {
+ unsigned char str[MB_LEN_MAX + 1];
+# ifdef _LIBC
+ uint32_t nrules =
+ _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+# endif
+
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[='. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == '=' && *p == ']') || p == pend)
+ break;
+ if (c1 < MB_LEN_MAX)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ if (c == '=' && *p == ']' && str[0] != '\0')
+ {
+ /* If we have no collation data we use the default
+ collation in which each character is in a class
+ by itself. It also means that ASCII is the
+ character set and therefore we cannot have character
+ with more than one byte in the multibyte
+ representation. */
+# ifdef _LIBC
+ if (nrules == 0)
+# endif
+ {
+ if (c1 != 1)
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ /* Throw away the ] at the end of the equivalence
+ class. */
+ PATFETCH (c);
+
+ /* Set the bit for the character. */
+ SET_LIST_BIT (str[0]);
+ }
+# ifdef _LIBC
+ else
+ {
+ /* Try to match the byte sequence in `str' against
+ those known to the collate implementation.
+ First find out whether the bytes in `str' are
+ actually from exactly one character. */
+ const int32_t *table;
+ const unsigned char *weights;
+ const unsigned char *extra;
+ const int32_t *indirect;
+ int32_t idx;
+ const unsigned char *cp = str;
+ int ch;
+
+ /* This #include defines a local function! */
+# include <locale/weight.h>
+
+ table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+ weights = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
+ indirect = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
+
+ idx = findidx (&cp);
+ if (idx == 0 || cp < str + c1)
+ /* This is no valid character. */
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ /* Throw away the ] at the end of the equivalence
+ class. */
+ PATFETCH (c);
+
+ /* Now we have to go throught the whole table
+ and find all characters which have the same
+ first level weight.
+
+ XXX Note that this is not entirely correct.
+ we would have to match multibyte sequences
+ but this is not possible with the current
+ implementation. */
+ for (ch = 1; ch < 256; ++ch)
+ /* XXX This test would have to be changed if we
+ would allow matching multibyte sequences. */
+ if (table[ch] > 0)
+ {
+ int32_t idx2 = table[ch];
+ size_t len = weights[idx2];
+
+ /* Test whether the lenghts match. */
+ if (weights[idx] == len)
+ {
+ /* They do. New compare the bytes of
+ the weight. */
+ size_t cnt = 0;
+
+ while (cnt < len
+ && (weights[idx + 1 + cnt]
+ == weights[idx2 + 1 + cnt]))
+ ++cnt;
+
+ if (cnt == len)
+ /* They match. Mark the character as
+ acceptable. */
+ SET_LIST_BIT (ch);
+ }
+ }
+ }
+# endif
+ had_char_class = true;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ SET_LIST_BIT ('[');
+ SET_LIST_BIT ('=');
+ range_start = '=';
+ had_char_class = false;
+ }
+ }
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
+ {
+ unsigned char str[128]; /* Should be large enough. */
+# ifdef _LIBC
+ uint32_t nrules =
+ _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+# endif
+
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[.'. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == '.' && *p == ']') || p == pend)
+ break;
+ if (c1 < sizeof (str))
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ if (c == '.' && *p == ']' && str[0] != '\0')
+ {
+ /* If we have no collation data we use the default
+ collation in which each character is the name
+ for its own class which contains only the one
+ character. It also means that ASCII is the
+ character set and therefore we cannot have character
+ with more than one byte in the multibyte
+ representation. */
+# ifdef _LIBC
+ if (nrules == 0)
+# endif
+ {
+ if (c1 != 1)
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ /* Throw away the ] at the end of the equivalence
+ class. */
+ PATFETCH (c);
+
+ /* Set the bit for the character. */
+ SET_LIST_BIT (str[0]);
+ range_start = ((const unsigned char *) str)[0];
+ }
+# ifdef _LIBC
+ else
+ {
+ /* Try to match the byte sequence in `str' against
+ those known to the collate implementation.
+ First find out whether the bytes in `str' are
+ actually from exactly one character. */
+ int32_t table_size;
+ const int32_t *symb_table;
+ const unsigned char *extra;
+ int32_t idx;
+ int32_t elem;
+ int32_t second;
+ int32_t hash;
+
+ table_size =
+ _NL_CURRENT_WORD (LC_COLLATE,
+ _NL_COLLATE_SYMB_HASH_SIZEMB);
+ symb_table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_SYMB_TABLEMB);
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_SYMB_EXTRAMB);
+
+ /* Locate the character in the hashing table. */
+ hash = elem_hash (str, c1);
+
+ idx = 0;
+ elem = hash % table_size;
+ second = hash % (table_size - 2);
+ while (symb_table[2 * elem] != 0)
+ {
+ /* First compare the hashing value. */
+ if (symb_table[2 * elem] == hash
+ && c1 == extra[symb_table[2 * elem + 1]]
+ && memcmp (str,
+ &extra[symb_table[2 * elem + 1]
+ + 1],
+ c1) == 0)
+ {
+ /* Yep, this is the entry. */
+ idx = symb_table[2 * elem + 1];
+ idx += 1 + extra[idx];
+ break;
+ }
+
+ /* Next entry. */
+ elem += second;
+ }
+
+ if (symb_table[2 * elem] == 0)
+ /* This is no valid character. */
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ /* Throw away the ] at the end of the equivalence
+ class. */
+ PATFETCH (c);
+
+ /* Now add the multibyte character(s) we found
+ to the accept list.
+
+ XXX Note that this is not entirely correct.
+ we would have to match multibyte sequences
+ but this is not possible with the current
+ implementation. Also, we have to match
+ collating symbols, which expand to more than
+ one file, as a whole and not allow the
+ individual bytes. */
+ c1 = extra[idx++];
+ if (c1 == 1)
+ range_start = extra[idx];
+ while (c1-- > 0)
+ {
+ SET_LIST_BIT (extra[idx]);
+ ++idx;
+ }
+ }
+# endif
+ had_char_class = false;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ SET_LIST_BIT ('[');
+ SET_LIST_BIT ('.');
+ range_start = '.';
+ had_char_class = false;
+ }
+ }
else
{
had_char_class = false;
SET_LIST_BIT (c);
+ range_start = c;
}
}
@@ -2394,6 +3590,7 @@ regex_compile (pattern, size, syntax, bufp)
while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
b[-1]--;
b += b[-1];
+#endif /* MBS_SUPPORT */
}
break;
@@ -2464,10 +3661,10 @@ regex_compile (pattern, size, syntax, bufp)
group. They are all relative offsets, so that if the
whole pattern moves because of realloc, they will still
be valid. */
- COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
+ COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
COMPILE_STACK_TOP.fixup_alt_jump
- = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
- COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
+ = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
+ COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
COMPILE_STACK_TOP.regnum = regnum;
/* We will eventually replace the 0 with the number of
@@ -2476,7 +3673,8 @@ regex_compile (pattern, size, syntax, bufp)
represent in the compiled pattern. */
if (regnum <= MAX_REGNUM)
{
- COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
+ COMPILE_STACK_TOP.inner_group_offset = b
+ - COMPILED_BUFFER_VAR + 2;
BUF_PUSH_3 (start_memory, regnum, 0);
}
@@ -2535,12 +3733,12 @@ regex_compile (pattern, size, syntax, bufp)
regnum_t this_group_regnum;
compile_stack.avail--;
- begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
+ begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
fixup_alt_jump
= COMPILE_STACK_TOP.fixup_alt_jump
- ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
+ ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
: 0;
- laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
+ laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
this_group_regnum = COMPILE_STACK_TOP.regnum;
/* If we've reached MAX_REGNUM groups, then this open
won't actually generate any code, so we'll have to
@@ -2551,8 +3749,8 @@ regex_compile (pattern, size, syntax, bufp)
groups were inside this one. */
if (this_group_regnum <= MAX_REGNUM)
{
- unsigned char *inner_group_loc
- = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
+ US_CHAR_TYPE *inner_group_loc
+ = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
*inner_group_loc = regnum - this_group_regnum;
BUF_PUSH_3 (stop_memory, this_group_regnum,
@@ -2571,10 +3769,11 @@ regex_compile (pattern, size, syntax, bufp)
/* Insert before the previous alternative a jump which
jumps to this alternative if the former fails. */
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (on_failure_jump, begalt, b + 6);
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (on_failure_jump, begalt,
+ b + 2 + 2 * OFFSET_ADDRESS_SIZE);
pending_exact = 0;
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
/* The alternative before this one has a jump after it
which gets executed if it gets matched. Adjust that
@@ -2599,8 +3798,8 @@ regex_compile (pattern, size, syntax, bufp)
to be filled in later either by next alternative or
when know we're at the end of a series of alternatives. */
fixup_alt_jump = b;
- GET_BUFFER_SPACE (3);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
laststart = 0;
begalt = b;
@@ -2612,8 +3811,7 @@ regex_compile (pattern, size, syntax, bufp)
if (!(syntax & RE_INTERVALS)
/* If we're at `\{' and it's not the open-interval
operator. */
- || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
- || (p - 2 == pattern && p == pend))
+ || (syntax & RE_NO_BK_BRACES))
goto normal_backslash;
handle_interval:
@@ -2623,57 +3821,43 @@ regex_compile (pattern, size, syntax, bufp)
/* At least (most) this many matches must be made. */
int lower_bound = -1, upper_bound = -1;
- beg_interval = p - 1;
+ /* Place in the uncompiled pattern (i.e., just after
+ the '{') to go back to if the interval is invalid. */
+ const CHAR_TYPE *beg_interval = p;
if (p == pend)
- {
- if (syntax & RE_NO_BK_BRACES)
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_EBRACE);
- }
+ goto invalid_interval;
GET_UNSIGNED_NUMBER (lower_bound);
if (c == ',')
{
GET_UNSIGNED_NUMBER (upper_bound);
- if (upper_bound < 0) upper_bound = RE_DUP_MAX;
+ if (upper_bound < 0)
+ upper_bound = RE_DUP_MAX;
}
else
/* Interval such as `{1}' => match exactly once. */
upper_bound = lower_bound;
- if (lower_bound < 0 || upper_bound > RE_DUP_MAX
- || lower_bound > upper_bound)
- {
- if (syntax & RE_NO_BK_BRACES)
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_BADBR);
- }
+ if (! (0 <= lower_bound && lower_bound <= upper_bound))
+ goto invalid_interval;
if (!(syntax & RE_NO_BK_BRACES))
{
- if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
-
+ if (c != '\\' || p == pend)
+ goto invalid_interval;
PATFETCH (c);
}
if (c != '}')
- {
- if (syntax & RE_NO_BK_BRACES)
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_BADBR);
- }
-
- /* We just parsed a valid interval. */
+ goto invalid_interval;
/* If it's invalid to have no preceding re. */
if (!laststart)
{
- if (syntax & RE_CONTEXT_INVALID_OPS)
+ if (syntax & RE_CONTEXT_INVALID_OPS
+ && !(syntax & RE_INVALID_INTERVAL_ORD))
FREE_STACK_RETURN (REG_BADRPT);
else if (syntax & RE_CONTEXT_INDEP_OPS)
laststart = b;
@@ -2681,14 +3865,22 @@ regex_compile (pattern, size, syntax, bufp)
goto unfetch_interval;
}
+ /* We just parsed a valid interval. */
+
+ if (RE_DUP_MAX < upper_bound)
+ FREE_STACK_RETURN (REG_BADBR);
+
/* If the upper bound is zero, don't want to succeed at
all; jump from `laststart' to `b + 3', which will be
- the end of the buffer after we insert the jump. */
+ the end of the buffer after we insert the jump. */
+ /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE'
+ instead of 'b + 3'. */
if (upper_bound == 0)
{
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (jump, laststart, b + 3);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (jump, laststart, b + 1
+ + OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
/* Otherwise, we have a nontrivial interval. When
@@ -2703,7 +3895,8 @@ regex_compile (pattern, size, syntax, bufp)
else
{ /* If the upper bound is > 1, we need to insert
more at the end of the loop. */
- unsigned nbytes = 10 + (upper_bound > 1) * 10;
+ unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
+ (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
GET_BUFFER_SPACE (nbytes);
@@ -2713,16 +3906,21 @@ regex_compile (pattern, size, syntax, bufp)
because `re_compile_fastmap' needs to know.
Jump to the `jump_n' we might insert below. */
INSERT_JUMP2 (succeed_n, laststart,
- b + 5 + (upper_bound > 1) * 5,
- lower_bound);
- b += 5;
+ b + 1 + 2 * OFFSET_ADDRESS_SIZE
+ + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
+ , lower_bound);
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
/* Code to initialize the lower bound. Insert
before the `succeed_n'. The `5' is the last two
bytes of this `set_number_at', plus 3 bytes of
the following `succeed_n'. */
- insert_op2 (set_number_at, laststart, 5, lower_bound, b);
- b += 5;
+ /* ifdef MBS_SUPPORT, The '1+2*OFFSET_ADDRESS_SIZE'
+ is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
+ of the following `succeed_n'. */
+ insert_op2 (set_number_at, laststart, 1
+ + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
if (upper_bound > 1)
{ /* More than one repetition is allowed, so
@@ -2732,9 +3930,10 @@ regex_compile (pattern, size, syntax, bufp)
When we've reached this during matching,
we'll have matched the interval once, so
jump back only `upper_bound - 1' times. */
- STORE_JUMP2 (jump_n, b, laststart + 5,
+ STORE_JUMP2 (jump_n, b, laststart
+ + 2 * OFFSET_ADDRESS_SIZE + 1,
upper_bound - 1);
- b += 5;
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
/* The location we want to set is the second
parameter of the `jump_n'; that is `b-2' as
@@ -2752,29 +3951,24 @@ regex_compile (pattern, size, syntax, bufp)
reinitialize the bounds. */
insert_op2 (set_number_at, laststart, b - laststart,
upper_bound - 1, b);
- b += 5;
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
}
}
pending_exact = 0;
- beg_interval = NULL;
- }
- break;
-
- unfetch_interval:
- /* If an invalid interval, match the characters as literals. */
- assert (beg_interval);
- p = beg_interval;
- beg_interval = NULL;
-
- /* normal_char and normal_backslash need `c'. */
- PATFETCH (c);
-
- if (!(syntax & RE_NO_BK_BRACES))
- {
- if (p > pattern && p[-1] == '\\')
- goto normal_backslash;
- }
- goto normal_char;
+ break;
+
+ invalid_interval:
+ if (!(syntax & RE_INVALID_INTERVAL_ORD))
+ FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
+ unfetch_interval:
+ /* Match the characters as literals. */
+ p = beg_interval;
+ c = '{';
+ if (syntax & RE_NO_BK_BRACES)
+ goto normal_char;
+ else
+ goto normal_backslash;
+ }
#ifdef emacs
/* There is no way to specify the before_dot and after_dot
@@ -2891,6 +4085,11 @@ regex_compile (pattern, size, syntax, bufp)
normal_char:
/* If no exactn currently being built. */
if (!pending_exact
+#ifdef MBS_SUPPORT
+ /* If last exactn handle binary(or character) and
+ new exactn handle character(or binary). */
+ || is_exactn_bin != is_binary[p - 1 - pattern]
+#endif /* MBS_SUPPORT */
/* If last exactn not at current position. */
|| pending_exact + *pending_exact + 1 != b
@@ -2912,7 +4111,16 @@ regex_compile (pattern, size, syntax, bufp)
laststart = b;
+#ifdef MBS_SUPPORT
+ /* Is this exactn binary data or character? */
+ is_exactn_bin = is_binary[p - 1 - pattern];
+ if (is_exactn_bin)
+ BUF_PUSH_2 (exactn_bin, 0);
+ else
+ BUF_PUSH_2 (exactn, 0);
+#else
BUF_PUSH_2 (exactn, 0);
+#endif /* MBS_SUPPORT */
pending_exact = b - 1;
}
@@ -2936,10 +4144,19 @@ regex_compile (pattern, size, syntax, bufp)
if (syntax & RE_NO_POSIX_BACKTRACKING)
BUF_PUSH (succeed);
+#ifdef MBS_SUPPORT
+ free (pattern);
+ free (mbs_offset);
+ free (is_binary);
+#endif
free (compile_stack.stack);
/* We have succeeded; set the length of the buffer. */
+#ifdef MBS_SUPPORT
+ bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
+#else
bufp->used = b - bufp->buffer;
+#endif
#ifdef DEBUG
if (debug)
@@ -2996,44 +4213,47 @@ regex_compile (pattern, size, syntax, bufp)
/* Subroutines for `regex_compile'. */
/* Store OP at LOC followed by two-byte integer parameter ARG. */
+/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
static void
store_op1 (op, loc, arg)
re_opcode_t op;
- unsigned char *loc;
+ US_CHAR_TYPE *loc;
int arg;
{
- *loc = (unsigned char) op;
+ *loc = (US_CHAR_TYPE) op;
STORE_NUMBER (loc + 1, arg);
}
/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
+/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
static void
store_op2 (op, loc, arg1, arg2)
re_opcode_t op;
- unsigned char *loc;
+ US_CHAR_TYPE *loc;
int arg1, arg2;
{
- *loc = (unsigned char) op;
+ *loc = (US_CHAR_TYPE) op;
STORE_NUMBER (loc + 1, arg1);
- STORE_NUMBER (loc + 3, arg2);
+ STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
}
/* Copy the bytes from LOC to END to open up three bytes of space at LOC
for OP followed by two-byte integer parameter ARG. */
+/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
static void
insert_op1 (op, loc, arg, end)
re_opcode_t op;
- unsigned char *loc;
+ US_CHAR_TYPE *loc;
int arg;
- unsigned char *end;
+ US_CHAR_TYPE *end;
{
- register unsigned char *pfrom = end;
- register unsigned char *pto = end + 3;
+ register US_CHAR_TYPE *pfrom = end;
+ register US_CHAR_TYPE *pto = end + 1 + OFFSET_ADDRESS_SIZE;
while (pfrom != loc)
*--pto = *--pfrom;
@@ -3043,16 +4263,17 @@ insert_op1 (op, loc, arg, end)
/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
+/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
static void
insert_op2 (op, loc, arg1, arg2, end)
re_opcode_t op;
- unsigned char *loc;
+ US_CHAR_TYPE *loc;
int arg1, arg2;
- unsigned char *end;
+ US_CHAR_TYPE *end;
{
- register unsigned char *pfrom = end;
- register unsigned char *pto = end + 5;
+ register US_CHAR_TYPE *pfrom = end;
+ register US_CHAR_TYPE *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
while (pfrom != loc)
*--pto = *--pfrom;
@@ -3067,10 +4288,10 @@ insert_op2 (op, loc, arg1, arg2, end)
static boolean
at_begline_loc_p (pattern, p, syntax)
- const char *pattern, *p;
+ const CHAR_TYPE *pattern, *p;
reg_syntax_t syntax;
{
- const char *prev = p - 2;
+ const CHAR_TYPE *prev = p - 2;
boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
return
@@ -3086,12 +4307,12 @@ at_begline_loc_p (pattern, p, syntax)
static boolean
at_endline_loc_p (p, pend, syntax)
- const char *p, *pend;
+ const CHAR_TYPE *p, *pend;
reg_syntax_t syntax;
{
- const char *next = p;
+ const CHAR_TYPE *next = p;
boolean next_backslash = *next == '\\';
- const char *next_next = p + 1 < pend ? p + 1 : 0;
+ const CHAR_TYPE *next_next = p + 1 < pend ? p + 1 : 0;
return
/* Before a subexpression? */
@@ -3122,7 +4343,100 @@ group_in_compile_stack (compile_stack, regnum)
return false;
}
+#ifdef MBS_SUPPORT
+/* This insert space, which size is "num", into the pattern at "loc".
+ "end" must point the end of the allocated buffer. */
+static void
+insert_space (num, loc, end)
+ int num;
+ CHAR_TYPE *loc;
+ CHAR_TYPE *end;
+{
+ register CHAR_TYPE *pto = end;
+ register CHAR_TYPE *pfrom = end - num;
+
+ while (pfrom >= loc)
+ *pto-- = *pfrom--;
+}
+#endif /* MBS_SUPPORT */
+
+#ifdef MBS_SUPPORT
+static reg_errcode_t
+compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
+ char_set)
+ CHAR_TYPE range_start_char;
+ const CHAR_TYPE **p_ptr, *pend;
+ CHAR_TYPE *char_set, *b;
+ RE_TRANSLATE_TYPE translate;
+ reg_syntax_t syntax;
+{
+ const CHAR_TYPE *p = *p_ptr;
+ CHAR_TYPE range_start, range_end;
+ reg_errcode_t ret;
+# ifdef _LIBC
+ uint32_t nrules;
+ uint32_t start_val, end_val;
+# endif
+ if (p == pend)
+ return REG_ERANGE;
+
+# ifdef _LIBC
+ nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+ if (nrules != 0)
+ {
+ const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
+ _NL_COLLATE_COLLSEQWC);
+ const unsigned char *extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+
+ if (range_start_char < -1)
+ {
+ /* range_start is a collating symbol. */
+ int32_t *wextra;
+ /* Retreive the index and get collation sequence value. */
+ wextra = (int32_t*)(extra + char_set[-range_start_char]);
+ start_val = wextra[1 + *wextra];
+ }
+ else
+ start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
+
+ end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
+
+ /* Report an error if the range is empty and the syntax prohibits
+ this. */
+ ret = ((syntax & RE_NO_EMPTY_RANGES)
+ && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
+ /* Insert space to the end of the char_ranges. */
+ insert_space(2, b - char_set[5] - 2, b - 1);
+ *(b - char_set[5] - 2) = (wchar_t)start_val;
+ *(b - char_set[5] - 1) = (wchar_t)end_val;
+ char_set[4]++; /* ranges_index */
+ }
+ else
+# endif
+ {
+ range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
+ range_start_char;
+ range_end = TRANSLATE (p[0]);
+ /* Report an error if the range is empty and the syntax prohibits
+ this. */
+ ret = ((syntax & RE_NO_EMPTY_RANGES)
+ && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
+
+ /* Insert space to the end of the char_ranges. */
+ insert_space(2, b - char_set[5] - 2, b - 1);
+ *(b - char_set[5] - 2) = range_start;
+ *(b - char_set[5] - 1) = range_end;
+ char_set[4]++; /* ranges_index */
+ }
+ /* Have to increment the pointer into the pattern string, so the
+ caller isn't still at the ending character. */
+ (*p_ptr)++;
+
+ return ret;
+}
+#else
/* Read the ending character of a range (in a bracket expression) from the
uncompiled pattern *P_PTR (which ends at PEND). We assume the
starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
@@ -3135,27 +4449,27 @@ group_in_compile_stack (compile_stack, regnum)
`regex_compile' itself. */
static reg_errcode_t
-compile_range (p_ptr, pend, translate, syntax, b)
- const char **p_ptr, *pend;
- RE_TRANSLATE_TYPE translate;
- reg_syntax_t syntax;
- unsigned char *b;
+compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
+ unsigned int range_start_char;
+ const char **p_ptr, *pend;
+ RE_TRANSLATE_TYPE translate;
+ reg_syntax_t syntax;
+ unsigned char *b;
{
unsigned this_char;
-
const char *p = *p_ptr;
reg_errcode_t ret;
- char range_start[2];
- char range_end[2];
+# if _LIBC
+ const unsigned char *collseq;
+ unsigned int start_colseq;
+ unsigned int end_colseq;
+# else
+ unsigned end_char;
+# endif
if (p == pend)
return REG_ERANGE;
- /* Fetch the endpoints without translating them; the
- appropriate translation is done in the bit-setting loop below. */
- range_start[0] = p[-2]; range_start[1] = '\0';
- range_end[0] = p[ 0]; range_end[1] = '\0';
-
/* Have to increment the pointer into the pattern string, so the
caller isn't still at the ending character. */
(*p_ptr)++;
@@ -3163,22 +4477,44 @@ compile_range (p_ptr, pend, translate, syntax, b)
/* Report an error if the range is empty and the syntax prohibits this. */
ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
- /* Here we see why `this_char' has to be larger than an `unsigned
- char' -- we would otherwise go into an infinite
- loop, since all characters <= 0xff. */
- for (this_char = 0; this_char <= (unsigned char) -1; this_char++)
+# if _LIBC
+ collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
+ _NL_COLLATE_COLLSEQMB);
+
+ start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
+ end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
+ for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
{
- char ch[2];
- ch[0] = this_char; ch[1] = '\0';
- if (strcoll (range_start, ch) <= 0 && strcoll (ch, range_end) <= 0)
+ unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
+
+ if (start_colseq <= this_colseq && this_colseq <= end_colseq)
{
SET_LIST_BIT (TRANSLATE (this_char));
ret = REG_NOERROR;
}
}
+# else
+ /* Here we see why `this_char' has to be larger than an `unsigned
+ char' -- we would otherwise go into an infinite loop, since all
+ characters <= 0xff. */
+ range_start_char = TRANSLATE (range_start_char);
+ /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
+ and some compilers cast it to int implicitly, so following for_loop
+ may fall to (almost) infinite loop.
+ e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
+ To avoid this, we cast p[0] to unsigned int and truncate it. */
+ end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
+
+ for (this_char = range_start_char; this_char <= end_char; ++this_char)
+ {
+ SET_LIST_BIT (TRANSLATE (this_char));
+ ret = REG_NOERROR;
+ }
+# endif
return ret;
}
+#endif /* MBS_SUPPORT */
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
@@ -3193,6 +4529,21 @@ compile_range (p_ptr, pend, translate, syntax, b)
Returns 0 if we succeed, -2 if an internal error. */
+#ifdef MBS_SUPPORT
+/* local function for re_compile_fastmap.
+ truncate wchar_t character to char. */
+static unsigned char truncate_wchar (CHAR_TYPE c);
+
+static unsigned char
+truncate_wchar (c)
+ CHAR_TYPE c;
+{
+ unsigned char buf[MB_LEN_MAX];
+ int retval = wctomb(buf, c);
+ return retval > 0 ? buf[0] : (unsigned char)c;
+}
+#endif /* MBS_SUPPORT */
+
int
re_compile_fastmap (bufp)
struct re_pattern_buffer *bufp;
@@ -3206,9 +4557,17 @@ re_compile_fastmap (bufp)
#endif
register char *fastmap = bufp->fastmap;
- unsigned char *pattern = bufp->buffer;
- unsigned char *p = pattern;
- register unsigned char *pend = pattern + bufp->used;
+
+#ifdef MBS_SUPPORT
+ /* We need to cast pattern to (wchar_t*), because we casted this compiled
+ pattern to (char*) in regex_compile. */
+ US_CHAR_TYPE *pattern = (US_CHAR_TYPE*)bufp->buffer;
+ register US_CHAR_TYPE *pend = (US_CHAR_TYPE*) (bufp->buffer + bufp->used);
+#else
+ US_CHAR_TYPE *pattern = bufp->buffer;
+ register US_CHAR_TYPE *pend = pattern + bufp->used;
+#endif /* MBS_SUPPORT */
+ US_CHAR_TYPE *p = pattern;
#ifdef REL_ALLOC
/* This holds the pointer to the failure stack, when
@@ -3271,11 +4630,30 @@ re_compile_fastmap (bufp)
/* Following are the cases which match a character. These end
with `break'. */
+#ifdef MBS_SUPPORT
+ case exactn:
+ fastmap[truncate_wchar(p[1])] = 1;
+ break;
+ case exactn_bin:
+ fastmap[p[1]] = 1;
+ break;
+#else
case exactn:
fastmap[p[1]] = 1;
break;
+#endif /* MBS_SUPPORT */
+#ifdef MBS_SUPPORT
+ /* It is hard to distinguish fastmap from (multi byte) characters
+ which depends on current locale. */
+ case charset:
+ case charset_not:
+ case wordchar:
+ case notwordchar:
+ bufp->can_be_null = 1;
+ goto done;
+#else
case charset:
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
@@ -3306,7 +4684,7 @@ re_compile_fastmap (bufp)
if (SYNTAX (j) != Sword)
fastmap[j] = 1;
break;
-
+#endif
case anychar:
{
@@ -3436,13 +4814,13 @@ re_compile_fastmap (bufp)
case succeed_n:
/* Get to the number of times to succeed. */
- p += 2;
+ p += OFFSET_ADDRESS_SIZE;
/* Increment p past the n for when k != 0. */
EXTRACT_NUMBER_AND_INCR (k, p);
if (k == 0)
{
- p -= 4;
+ p -= 2 * OFFSET_ADDRESS_SIZE;
succeed_n_p = true; /* Spaghetti code alert. */
goto handle_on_failure_jump;
}
@@ -3450,7 +4828,7 @@ re_compile_fastmap (bufp)
case set_number_at:
- p += 4;
+ p += 2 * OFFSET_ADDRESS_SIZE;
continue;
@@ -3527,7 +4905,7 @@ weak_alias (__re_set_registers, re_set_registers)
/* Searching routines. */
/* Like re_search_2, below, but only one string is specified, and
- doesn't let you say where to stop matching. */
+ doesn't let you say where to stop matching. */
int
re_search (bufp, string, size, startpos, range, regs)
@@ -3658,9 +5036,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
}
else /* Searching backwards. */
{
- register char c = (size1 == 0 || startpos >= size1
- ? string2[startpos - size1]
- : string1[startpos]);
+ register CHAR_TYPE c = (size1 == 0 || startpos >= size1
+ ? string2[startpos - size1]
+ : string1[startpos]);
if (!fastmap[(unsigned char) TRANSLATE (c)])
goto advance;
@@ -3706,12 +5084,24 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
weak_alias (__re_search_2, re_search_2)
#endif
+#ifdef MBS_SUPPORT
+/* This converts PTR, a pointer into one of the search wchar_t strings
+ `string1' and `string2' into an multibyte string offset from the
+ beginning of that string. We use mbs_offset to optimize.
+ See convert_mbs_to_wcs. */
+# define POINTER_TO_OFFSET(ptr) \
+ (FIRST_STRING_P (ptr) \
+ ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
+ : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
+ + csize1)))
+#else
/* This converts PTR, a pointer into one of the search strings `string1'
and `string2' into an offset from the beginning of that string. */
-#define POINTER_TO_OFFSET(ptr) \
+# define POINTER_TO_OFFSET(ptr) \
(FIRST_STRING_P (ptr) \
? ((regoff_t) ((ptr) - string1)) \
: ((regoff_t) ((ptr) - string2 + size1)))
+#endif /* MBS_SUPPORT */
/* Macros for dealing with the split strings in re_match_2. */
@@ -3741,10 +5131,17 @@ weak_alias (__re_search_2, re_search_2)
two special cases to check for: if past the end of string1, look at
the first character in string2; and if before the beginning of
string2, look at the last character in string1. */
-#define WORDCHAR_P(d) \
+#ifdef MBS_SUPPORT
+/* Use internationalized API instead of SYNTAX. */
+# define WORDCHAR_P(d) \
+ (iswalnum ((wint_t)((d) == end1 ? *string2 \
+ : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0)
+#else
+# define WORDCHAR_P(d) \
(SYNTAX ((d) == end1 ? *string2 \
: (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
== Sword)
+#endif /* MBS_SUPPORT */
/* Disabled due to a compiler bug -- see comment at case wordbound */
#if 0
@@ -3758,7 +5155,26 @@ weak_alias (__re_search_2, re_search_2)
/* Free everything we malloc. */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
-# define FREE_VARIABLES() \
+# ifdef MBS_SUPPORT
+# define FREE_VARIABLES() \
+ do { \
+ REGEX_FREE_STACK (fail_stack.stack); \
+ FREE_VAR (regstart); \
+ FREE_VAR (regend); \
+ FREE_VAR (old_regstart); \
+ FREE_VAR (old_regend); \
+ FREE_VAR (best_regstart); \
+ FREE_VAR (best_regend); \
+ FREE_VAR (reg_info); \
+ FREE_VAR (reg_dummy); \
+ FREE_VAR (reg_info_dummy); \
+ FREE_VAR (string1); \
+ FREE_VAR (string2); \
+ FREE_VAR (mbs_offset1); \
+ FREE_VAR (mbs_offset2); \
+ } while (0)
+# else /* not MBS_SUPPORT */
+# define FREE_VARIABLES() \
do { \
REGEX_FREE_STACK (fail_stack.stack); \
FREE_VAR (regstart); \
@@ -3771,8 +5187,20 @@ weak_alias (__re_search_2, re_search_2)
FREE_VAR (reg_dummy); \
FREE_VAR (reg_info_dummy); \
} while (0)
+# endif /* MBS_SUPPORT */
#else
-# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
+# define FREE_VAR(var) if (var) free (var); var = NULL
+# ifdef MBS_SUPPORT
+# define FREE_VARIABLES() \
+ do { \
+ FREE_VAR (string1); \
+ FREE_VAR (string2); \
+ FREE_VAR (mbs_offset1); \
+ FREE_VAR (mbs_offset2); \
+ } while (0)
+# else
+# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
+# endif /* MBS_SUPPORT */
#endif /* not MATCH_MAY_ALLOCATE */
/* These values must meet several constraints. They must not be valid
@@ -3811,16 +5239,16 @@ weak_alias (__re_match, re_match)
# endif
#endif /* not emacs */
-static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
- unsigned char *end,
+static boolean group_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
+ US_CHAR_TYPE *end,
register_info_type *reg_info));
-static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p,
- unsigned char *end,
+static boolean alt_match_null_string_p _RE_ARGS ((US_CHAR_TYPE *p,
+ US_CHAR_TYPE *end,
register_info_type *reg_info));
-static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p,
- unsigned char *end,
+static boolean common_op_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
+ US_CHAR_TYPE *end,
register_info_type *reg_info));
-static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2,
+static int bcmp_translate _RE_ARGS ((const CHAR_TYPE *s1, const CHAR_TYPE *s2,
int len, char *translate));
/* re_match_2 matches the compiled pattern in BUFP against the
@@ -3858,38 +5286,97 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
weak_alias (__re_match_2, re_match_2)
#endif
+#ifdef MBS_SUPPORT
+
+static int count_mbs_length PARAMS ((int *, int));
+
+/* This check the substring (from 0, to length) of the multibyte string,
+ to which offset_buffer correspond. And count how many wchar_t_characters
+ the substring occupy. We use offset_buffer to optimization.
+ See convert_mbs_to_wcs. */
+
+static int
+count_mbs_length(offset_buffer, length)
+ int *offset_buffer;
+ int length;
+{
+ int wcs_size;
+
+ /* Check whether the size is valid. */
+ if (length < 0)
+ return -1;
+
+ if (offset_buffer == NULL)
+ return 0;
+
+ for (wcs_size = 0 ; offset_buffer[wcs_size] != -1 ; wcs_size++)
+ {
+ if (offset_buffer[wcs_size] == length)
+ return wcs_size;
+ if (offset_buffer[wcs_size] > length)
+ /* It is a fragment of a wide character. */
+ return -1;
+ }
+
+ /* We reached at the sentinel. */
+ return -1;
+}
+#endif /* MBS_SUPPORT */
+
/* This is a separate function so that we can force an alloca cleanup
afterwards. */
static int
+#ifdef MBS_SUPPORT
+re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos, regs, stop)
+ struct re_pattern_buffer *bufp;
+ const char *cstring1, *cstring2;
+ int csize1, csize2;
+#else
re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
struct re_pattern_buffer *bufp;
const char *string1, *string2;
int size1, size2;
+#endif
int pos;
struct re_registers *regs;
int stop;
{
/* General temporaries. */
int mcnt;
- unsigned char *p1;
+ US_CHAR_TYPE *p1;
+#ifdef MBS_SUPPORT
+ /* We need wchar_t* buffers correspond to string1, string2. */
+ CHAR_TYPE *string1 = NULL, *string2 = NULL;
+ /* We need the size of wchar_t buffers correspond to csize1, csize2. */
+ int size1 = 0, size2 = 0;
+ /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
+ int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
+ /* They hold whether each wchar_t is binary data or not. */
+ char *is_binary = NULL;
+#endif /* MBS_SUPPORT */
/* Just past the end of the corresponding string. */
- const char *end1, *end2;
+ const CHAR_TYPE *end1, *end2;
/* Pointers into string1 and string2, just past the last characters in
each to consider matching. */
- const char *end_match_1, *end_match_2;
+ const CHAR_TYPE *end_match_1, *end_match_2;
/* Where we are in the data, and the end of the current string. */
- const char *d, *dend;
+ const CHAR_TYPE *d, *dend;
/* Where we are in the pattern, and the end of the pattern. */
- unsigned char *p = bufp->buffer;
- register unsigned char *pend = p + bufp->used;
+#ifdef MBS_SUPPORT
+ US_CHAR_TYPE *pattern, *p;
+ register US_CHAR_TYPE *pend;
+#else
+ US_CHAR_TYPE *p = bufp->buffer;
+ register US_CHAR_TYPE *pend = p + bufp->used;
+#endif /* MBS_SUPPORT */
/* Mark the opcode just after a start_memory, so we can test for an
empty subpattern when we get to the stop_memory. */
- unsigned char *just_past_start_mem = 0;
+ US_CHAR_TYPE *just_past_start_mem = 0;
/* We use this to map every character in the string. */
RE_TRANSLATE_TYPE translate = bufp->translate;
@@ -3934,7 +5421,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
stopped matching the regnum-th subexpression. (The zeroth register
keeps track of what the whole pattern matches.) */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **regstart, **regend;
+ const CHAR_TYPE **regstart, **regend;
#endif
/* If a group that's operated upon by a repetition operator fails to
@@ -3943,7 +5430,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
are when we last see its open-group operator. Similarly for a
register's end. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **old_regstart, **old_regend;
+ const CHAR_TYPE **old_regstart, **old_regend;
#endif
/* The is_active field of reg_info helps us keep track of which (possibly
@@ -3962,7 +5449,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
turn happens only if we have not yet matched the entire string. */
unsigned best_regs_set = false;
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **best_regstart, **best_regend;
+ const CHAR_TYPE **best_regstart, **best_regend;
#endif
/* Logically, this is `best_regend[0]'. But we don't want to have to
@@ -3973,14 +5460,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
the end of the best match so far in a separate variable. We
initialize this to NULL so that when we backtrack the first time
and need to test it, it's not garbage. */
- const char *match_end = NULL;
+ const CHAR_TYPE *match_end = NULL;
/* This helps SET_REGS_MATCHED avoid doing redundant work. */
int set_regs_matched_done = 0;
/* Used when we pop values we don't care about. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **reg_dummy;
+ const CHAR_TYPE **reg_dummy;
register_info_type *reg_info_dummy;
#endif
@@ -4001,14 +5488,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
array indexing. We should fix this. */
if (bufp->re_nsub)
{
- regstart = REGEX_TALLOC (num_regs, const char *);
- regend = REGEX_TALLOC (num_regs, const char *);
- old_regstart = REGEX_TALLOC (num_regs, const char *);
- old_regend = REGEX_TALLOC (num_regs, const char *);
- best_regstart = REGEX_TALLOC (num_regs, const char *);
- best_regend = REGEX_TALLOC (num_regs, const char *);
+ regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ old_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ old_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ best_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
+ best_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
reg_info = REGEX_TALLOC (num_regs, register_info_type);
- reg_dummy = REGEX_TALLOC (num_regs, const char *);
+ reg_dummy = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
if (!(regstart && regend && old_regstart && old_regend && reg_info
@@ -4029,12 +5516,63 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
#endif /* MATCH_MAY_ALLOCATE */
/* The starting position is bogus. */
+#ifdef MBS_SUPPORT
+ if (pos < 0 || pos > csize1 + csize2)
+#else
if (pos < 0 || pos > size1 + size2)
+#endif
{
FREE_VARIABLES ();
return -1;
}
+#ifdef MBS_SUPPORT
+ /* Allocate wchar_t array for string1 and string2 and
+ fill them with converted string. */
+ if (csize1 != 0)
+ {
+ string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE);
+ mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
+ is_binary = REGEX_TALLOC (csize1 + 1, char);
+ if (!string1 || !mbs_offset1 || !is_binary)
+ {
+ FREE_VAR (string1);
+ FREE_VAR (mbs_offset1);
+ FREE_VAR (is_binary);
+ return -2;
+ }
+ size1 = convert_mbs_to_wcs(string1, cstring1, csize1,
+ mbs_offset1, is_binary);
+ string1[size1] = L'\0'; /* for a sentinel */
+ FREE_VAR (is_binary);
+ }
+ if (csize2 != 0)
+ {
+ string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE);
+ mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
+ is_binary = REGEX_TALLOC (csize2 + 1, char);
+ if (!string2 || !mbs_offset2 || !is_binary)
+ {
+ FREE_VAR (string1);
+ FREE_VAR (mbs_offset1);
+ FREE_VAR (string2);
+ FREE_VAR (mbs_offset2);
+ FREE_VAR (is_binary);
+ return -2;
+ }
+ size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
+ mbs_offset2, is_binary);
+ string2[size2] = L'\0'; /* for a sentinel */
+ FREE_VAR (is_binary);
+ }
+
+ /* We need to cast pattern to (wchar_t*), because we casted this compiled
+ pattern to (char*) in regex_compile. */
+ p = pattern = (CHAR_TYPE*)bufp->buffer;
+ pend = (CHAR_TYPE*)(bufp->buffer + bufp->used);
+
+#endif /* MBS_SUPPORT */
+
/* Initialize subexpression text positions to -1 to mark ones that no
start_memory/stop_memory has been seen for. Also initialize the
register information struct. */
@@ -4062,6 +5600,25 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
end2 = string2 + size2;
/* Compute where to stop matching, within the two strings. */
+#ifdef MBS_SUPPORT
+ if (stop <= csize1)
+ {
+ mcnt = count_mbs_length(mbs_offset1, stop);
+ end_match_1 = string1 + mcnt;
+ end_match_2 = string2;
+ }
+ else
+ {
+ end_match_1 = end1;
+ mcnt = count_mbs_length(mbs_offset2, stop-csize1);
+ end_match_2 = string2 + mcnt;
+ }
+ if (mcnt < 0)
+ { /* count_mbs_length return error. */
+ FREE_VARIABLES ();
+ return -1;
+ }
+#else
if (stop <= size1)
{
end_match_1 = string1 + stop;
@@ -4072,6 +5629,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
end_match_1 = end1;
end_match_2 = string2 + stop - size1;
}
+#endif /* MBS_SUPPORT */
/* `p' scans through the pattern as `d' scans through the data.
`dend' is the end of the input string that `d' points within. `d'
@@ -4079,6 +5637,26 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
this happens before fetching; therefore, at the beginning of the
loop, `d' can be pointing at the end of a string, but it cannot
equal `string2'. */
+#ifdef MBS_SUPPORT
+ if (size1 > 0 && pos <= csize1)
+ {
+ mcnt = count_mbs_length(mbs_offset1, pos);
+ d = string1 + mcnt;
+ dend = end_match_1;
+ }
+ else
+ {
+ mcnt = count_mbs_length(mbs_offset2, pos-csize1);
+ d = string2 + mcnt;
+ dend = end_match_2;
+ }
+
+ if (mcnt < 0)
+ { /* count_mbs_length return error. */
+ FREE_VARIABLES ();
+ return -1;
+ }
+#else
if (size1 > 0 && pos <= size1)
{
d = string1 + pos;
@@ -4089,6 +5667,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
d = string2 + pos - size1;
dend = end_match_2;
}
+#endif /* MBS_SUPPORT */
DEBUG_PRINT1 ("The compiled pattern is:\n");
DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
@@ -4178,11 +5757,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
succeed_label:
DEBUG_PRINT1 ("Accepting match.\n");
-
/* If caller wants register contents data back, do it. */
if (regs && !bufp->no_sub)
{
- /* Have the register data arrays been allocated? */
+ /* Have the register data arrays been allocated? */
if (bufp->regs_allocated == REGS_UNALLOCATED)
{ /* No. So allocate them with malloc. We need one
extra element beyond `num_regs' for the `-1' marker
@@ -4226,9 +5804,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
if (regs->num_regs > 0)
{
regs->start[0] = pos;
+#ifdef MBS_SUPPORT
+ if (MATCHING_IN_FIRST_STRING)
+ regs->end[0] = mbs_offset1 != NULL ?
+ mbs_offset1[d-string1] : 0;
+ else
+ regs->end[0] = csize1 + (mbs_offset2 != NULL ?
+ mbs_offset2[d-string2] : 0);
+#else
regs->end[0] = (MATCHING_IN_FIRST_STRING
? ((regoff_t) (d - string1))
: ((regoff_t) (d - string2 + size1)));
+#endif /* MBS_SUPPORT */
}
/* Go through the first `min (num_regs, regs->num_regs)'
@@ -4261,9 +5848,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
nfailure_points_pushed - nfailure_points_popped);
DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
+#ifdef MBS_SUPPORT
+ if (MATCHING_IN_FIRST_STRING)
+ mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
+ else
+ mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
+ csize1;
+ mcnt -= pos;
+#else
mcnt = d - pos - (MATCHING_IN_FIRST_STRING
? string1
: string2 - size1);
+#endif /* MBS_SUPPORT */
DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
@@ -4288,6 +5884,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
byte in the pattern defines n, and the n bytes after that
are the characters to match. */
case exactn:
+#ifdef MBS_SUPPORT
+ case exactn_bin:
+#endif
mcnt = *p++;
DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
@@ -4298,9 +5897,23 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
do
{
PREFETCH ();
- if ((unsigned char) translate[(unsigned char) *d++]
- != (unsigned char) *p++)
+#ifdef MBS_SUPPORT
+ if (*d <= 0xff)
+ {
+ if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
+ != (US_CHAR_TYPE) *p++)
+ goto fail;
+ }
+ else
+ {
+ if (*d++ != (CHAR_TYPE) *p++)
+ goto fail;
+ }
+#else
+ if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
+ != (US_CHAR_TYPE) *p++)
goto fail;
+#endif /* MBS_SUPPORT */
}
while (--mcnt);
}
@@ -4309,7 +5922,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
do
{
PREFETCH ();
- if (*d++ != (char) *p++) goto fail;
+ if (*d++ != (CHAR_TYPE) *p++) goto fail;
}
while (--mcnt);
}
@@ -4328,7 +5941,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
goto fail;
SET_REGS_MATCHED ();
- DEBUG_PRINT2 (" Matched `%d'.\n", *d);
+ DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
d++;
break;
@@ -4336,14 +5949,358 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
case charset:
case charset_not:
{
- register unsigned char c;
+ register US_CHAR_TYPE c;
+#ifdef MBS_SUPPORT
+ unsigned int i, char_class_length, coll_symbol_length,
+ equiv_class_length, ranges_length, chars_length, length;
+ CHAR_TYPE *workp, *workp2, *charset_top;
+#define WORK_BUFFER_SIZE 128
+ CHAR_TYPE str_buf[WORK_BUFFER_SIZE];
+# ifdef _LIBC
+ uint32_t nrules;
+# endif /* _LIBC */
+#endif /* MBS_SUPPORT */
boolean not = (re_opcode_t) *(p - 1) == charset_not;
DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
-
PREFETCH ();
c = TRANSLATE (*d); /* The character to match. */
+#ifdef MBS_SUPPORT
+# ifdef _LIBC
+ nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+# endif /* _LIBC */
+ charset_top = p - 1;
+ char_class_length = *p++;
+ coll_symbol_length = *p++;
+ equiv_class_length = *p++;
+ ranges_length = *p++;
+ chars_length = *p++;
+ /* p points charset[6], so the address of the next instruction
+ (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
+ where l=length of char_classes, m=length of collating_symbol,
+ n=equivalence_class, o=length of char_range,
+ p'=length of character. */
+ workp = p;
+ /* Update p to indicate the next instruction. */
+ p += char_class_length + coll_symbol_length+ equiv_class_length +
+ 2*ranges_length + chars_length;
+
+ /* match with char_class? */
+ for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
+ {
+ wctype_t wctype;
+ uintptr_t alignedp = ((uintptr_t)workp
+ + __alignof__(wctype_t) - 1)
+ & ~(uintptr_t)(__alignof__(wctype_t) - 1);
+ wctype = *((wctype_t*)alignedp);
+ workp += CHAR_CLASS_SIZE;
+ if (iswctype((wint_t)c, wctype))
+ goto char_set_matched;
+ }
+
+ /* match with collating_symbol? */
+# ifdef _LIBC
+ if (nrules != 0)
+ {
+ const unsigned char *extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+
+ for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
+ workp++)
+ {
+ int32_t *wextra;
+ wextra = (int32_t*)(extra + *workp++);
+ for (i = 0; i < *wextra; ++i)
+ if (TRANSLATE(d[i]) != wextra[1 + i])
+ break;
+
+ if (i == *wextra)
+ {
+ /* Update d, however d will be incremented at
+ char_set_matched:, we decrement d here. */
+ d += i - 1;
+ goto char_set_matched;
+ }
+ }
+ }
+ else /* (nrules == 0) */
+# endif
+ /* If we can't look up collation data, we use wcscoll
+ instead. */
+ {
+ for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
+ {
+ const CHAR_TYPE *backup_d = d, *backup_dend = dend;
+ length = wcslen(workp);
+
+ /* If wcscoll(the collating symbol, whole string) > 0,
+ any substring of the string never match with the
+ collating symbol. */
+ if (wcscoll(workp, d) > 0)
+ {
+ workp += length + 1;
+ continue;
+ }
+
+ /* First, we compare the collating symbol with
+ the first character of the string.
+ If it don't match, we add the next character to
+ the compare buffer in turn. */
+ for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
+ {
+ int match;
+ if (d == dend)
+ {
+ if (dend == end_match_2)
+ break;
+ d = string2;
+ dend = end_match_2;
+ }
+
+ /* add next character to the compare buffer. */
+ str_buf[i] = TRANSLATE(*d);
+ str_buf[i+1] = '\0';
+
+ match = wcscoll(workp, str_buf);
+ if (match == 0)
+ goto char_set_matched;
+
+ if (match < 0)
+ /* (str_buf > workp) indicate (str_buf + X > workp),
+ because for all X (str_buf + X > str_buf).
+ So we don't need continue this loop. */
+ break;
+
+ /* Otherwise(str_buf < workp),
+ (str_buf+next_character) may equals (workp).
+ So we continue this loop. */
+ }
+ /* not matched */
+ d = backup_d;
+ dend = backup_dend;
+ workp += length + 1;
+ }
+ }
+ /* match with equivalence_class? */
+# ifdef _LIBC
+ if (nrules != 0)
+ {
+ const CHAR_TYPE *backup_d = d, *backup_dend = dend;
+ /* Try to match the equivalence class against
+ those known to the collate implementation. */
+ const int32_t *table;
+ const int32_t *weights;
+ const int32_t *extra;
+ const int32_t *indirect;
+ int32_t idx, idx2;
+ wint_t *cp;
+ size_t len;
+
+ /* This #include defines a local function! */
+# include <locale/weightwc.h>
+
+ table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
+ weights = (const wint_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
+ extra = (const wint_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
+ indirect = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
+
+ /* Write 1 collating element to str_buf, and
+ get its index. */
+ idx2 = 0;
+
+ for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
+ {
+ cp = (wint_t*)str_buf;
+ if (d == dend)
+ {
+ if (dend == end_match_2)
+ break;
+ d = string2;
+ dend = end_match_2;
+ }
+ str_buf[i] = TRANSLATE(*(d+i));
+ str_buf[i+1] = '\0'; /* sentinel */
+ idx2 = findidx ((const wint_t**)&cp);
+ }
+
+ /* Update d, however d will be incremented at
+ char_set_matched:, we decrement d here. */
+ d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
+ if (d >= dend)
+ {
+ if (dend == end_match_2)
+ d = dend;
+ else
+ {
+ d = string2;
+ dend = end_match_2;
+ }
+ }
+
+ len = weights[idx2];
+ for (workp2 = workp + equiv_class_length ; workp < workp2 ;
+ workp++)
+ {
+ idx = (int32_t)*workp;
+ /* We already checked idx != 0 in regex_compile. */
+
+ if (idx2 != 0 && len == weights[idx])
+ {
+ int cnt = 0;
+ while (cnt < len && (weights[idx + 1 + cnt]
+ == weights[idx2 + 1 + cnt]))
+ ++cnt;
+
+ if (cnt == len)
+ goto char_set_matched;
+ }
+ }
+ /* not matched */
+ d = backup_d;
+ dend = backup_dend;
+ }
+ else /* (nrules == 0) */
+# endif
+ /* If we can't look up collation data, we use wcscoll
+ instead. */
+ {
+ for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
+ {
+ const CHAR_TYPE *backup_d = d, *backup_dend = dend;
+ length = wcslen(workp);
+
+ /* If wcscoll(the collating symbol, whole string) > 0,
+ any substring of the string never match with the
+ collating symbol. */
+ if (wcscoll(workp, d) > 0)
+ {
+ workp += length + 1;
+ break;
+ }
+
+ /* First, we compare the equivalence class with
+ the first character of the string.
+ If it don't match, we add the next character to
+ the compare buffer in turn. */
+ for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
+ {
+ int match;
+ if (d == dend)
+ {
+ if (dend == end_match_2)
+ break;
+ d = string2;
+ dend = end_match_2;
+ }
+
+ /* add next character to the compare buffer. */
+ str_buf[i] = TRANSLATE(*d);
+ str_buf[i+1] = '\0';
+
+ match = wcscoll(workp, str_buf);
+
+ if (match == 0)
+ goto char_set_matched;
+
+ if (match < 0)
+ /* (str_buf > workp) indicate (str_buf + X > workp),
+ because for all X (str_buf + X > str_buf).
+ So we don't need continue this loop. */
+ break;
+
+ /* Otherwise(str_buf < workp),
+ (str_buf+next_character) may equals (workp).
+ So we continue this loop. */
+ }
+ /* not matched */
+ d = backup_d;
+ dend = backup_dend;
+ workp += length + 1;
+ }
+ }
+
+ /* match with char_range? */
+#ifdef _LIBC
+ if (nrules != 0)
+ {
+ uint32_t collseqval;
+ const char *collseq = (const char *)
+ _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
+
+ collseqval = collseq_table_lookup (collseq, c);
+
+ for (; workp < p - chars_length ;)
+ {
+ uint32_t start_val, end_val;
+
+ /* We already compute the collation sequence value
+ of the characters (or collating symbols). */
+ start_val = (uint32_t) *workp++; /* range_start */
+ end_val = (uint32_t) *workp++; /* range_end */
+
+ if (start_val <= collseqval && collseqval <= end_val)
+ goto char_set_matched;
+ }
+ }
+ else
+#endif
+ {
+ /* We set range_start_char at str_buf[0], range_end_char
+ at str_buf[4], and compared char at str_buf[2]. */
+ str_buf[1] = 0;
+ str_buf[2] = c;
+ str_buf[3] = 0;
+ str_buf[5] = 0;
+ for (; workp < p - chars_length ;)
+ {
+ wchar_t *range_start_char, *range_end_char;
+
+ /* match if (range_start_char <= c <= range_end_char). */
+
+ /* If range_start(or end) < 0, we assume -range_start(end)
+ is the offset of the collating symbol which is specified
+ as the character of the range start(end). */
+
+ /* range_start */
+ if (*workp < 0)
+ range_start_char = charset_top - (*workp++);
+ else
+ {
+ str_buf[0] = *workp++;
+ range_start_char = str_buf;
+ }
+
+ /* range_end */
+ if (*workp < 0)
+ range_end_char = charset_top - (*workp++);
+ else
+ {
+ str_buf[4] = *workp++;
+ range_end_char = str_buf + 4;
+ }
+
+ if (wcscoll(range_start_char, str_buf+2) <= 0 &&
+ wcscoll(str_buf+2, range_end_char) <= 0)
+
+ goto char_set_matched;
+ }
+ }
+
+ /* match with char? */
+ for (; workp < p ; workp++)
+ if (c == *workp)
+ goto char_set_matched;
+
+ not = !not;
+
+ char_set_matched:
+ if (not) goto fail;
+#else
/* Cast to `unsigned' instead of `unsigned char' in case the
bit list is a full 32 bytes long. */
if (c < (unsigned) (*p * BYTEWIDTH)
@@ -4353,7 +6310,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
p += 1 + *p;
if (!not) goto fail;
-
+#undef WORK_BUFFER_SIZE
+#endif /* MBS_SUPPORT */
SET_REGS_MATCHED ();
d++;
break;
@@ -4366,7 +6324,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
matched within the group is recorded (in the internal
registers data structure) under the register number. */
case start_memory:
- DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
+ DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
+ (long int) *p, (long int) p[1]);
/* Find out if this group can match the empty string. */
p1 = p; /* To send to group_match_null_string_p. */
@@ -4414,7 +6373,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
arguments are the same as start_memory's: the register
number, and the number of inner groups. */
case stop_memory:
- DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
+ DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
+ (long int) *p, (long int) p[1]);
/* We need to save the string position the last time we were at
this close-group operator in case the group is operated
@@ -4448,7 +6408,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
it isn't necessarily one less than now: consider
(a(b)c(d(e)f)g). When group 3 ends, after the f), the
new highest active register is 1. */
- unsigned char r = *p - 1;
+ US_CHAR_TYPE r = *p - 1;
while (r > 0 && !IS_ACTIVE (reg_info[r]))
r--;
@@ -4491,7 +6451,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
case dummy_failure_jump:
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
if (is_a_jump_n)
- p1 += 2;
+ p1 += OFFSET_ADDRESS_SIZE;
break;
default:
@@ -4505,7 +6465,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
by forcing a failure after pushing on the stack the
on_failure_jump's jump in the pattern, and d. */
if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
- && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
+ && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
+ && p1[2+OFFSET_ADDRESS_SIZE] == *p)
{
/* If this group ever matched anything, then restore
what its registers were before trying this last
@@ -4551,7 +6512,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
followed by the numeric value of <digit> as the register number. */
case duplicate:
{
- register const char *d2, *dend2;
+ register const CHAR_TYPE *d2, *dend2;
int regno = *p++; /* Get which register to match against. */
DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
@@ -4601,7 +6562,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
past them. */
if (translate
? bcmp_translate (d, d2, mcnt, translate)
- : memcmp (d, d2, mcnt))
+ : memcmp (d, d2, mcnt*sizeof(US_CHAR_TYPE)))
goto fail;
d += mcnt, d2 += mcnt;
@@ -4757,7 +6718,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
{
- register unsigned char *p2 = p;
+ register US_CHAR_TYPE *p2 = p;
/* Compare the beginning of the repeat with what in the
pattern follows its end. If we can establish that there
@@ -4782,9 +6743,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
&& ((re_opcode_t) *p2 == stop_memory
|| (re_opcode_t) *p2 == start_memory))
p2 += 3;
- else if (p2 + 6 < pend
+ else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
&& (re_opcode_t) *p2 == dummy_failure_jump)
- p2 += 6;
+ p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
else
break;
}
@@ -4800,30 +6761,48 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Consider what happens when matching ":\(.*\)"
against ":/". I don't really understand this code
yet. */
- p[-3] = (unsigned char) pop_failure_jump;
+ p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
+ pop_failure_jump;
DEBUG_PRINT1
(" End of pattern: change to `pop_failure_jump'.\n");
}
else if ((re_opcode_t) *p2 == exactn
+#ifdef MBS_SUPPORT
+ || (re_opcode_t) *p2 == exactn_bin
+#endif
|| (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
{
- register unsigned char c
- = *p2 == (unsigned char) endline ? '\n' : p2[2];
+ register US_CHAR_TYPE c
+ = *p2 == (US_CHAR_TYPE) endline ? '\n' : p2[2];
- if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
+ if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
+#ifdef MBS_SUPPORT
+ || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
+#endif
+ ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
{
- p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
- c, p1[5]);
+ p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
+ pop_failure_jump;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1)
+ DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
+ (wint_t) c,
+ (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
+ else
+#endif
+ DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
+ (char) c,
+ (char) p1[3+OFFSET_ADDRESS_SIZE]);
}
+#ifndef MBS_SUPPORT
else if ((re_opcode_t) p1[3] == charset
|| (re_opcode_t) p1[3] == charset_not)
{
int not = (re_opcode_t) p1[3] == charset_not;
- if (c < (unsigned char) (p1[4] * BYTEWIDTH)
+ if (c < (unsigned) (p1[4] * BYTEWIDTH)
&& p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
not = !not;
@@ -4835,7 +6814,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
}
}
+#endif /* not MBS_SUPPORT */
}
+#ifndef MBS_SUPPORT
else if ((re_opcode_t) *p2 == charset)
{
/* We win if the first character of the loop is not part
@@ -4884,11 +6865,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
}
}
}
+#endif /* not MBS_SUPPORT */
}
- p -= 2; /* Point at relative address again. */
+ p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
if ((re_opcode_t) p[-1] != pop_failure_jump)
{
- p[-1] = (unsigned char) jump;
+ p[-1] = (US_CHAR_TYPE) jump;
DEBUG_PRINT1 (" Match => jump.\n");
goto unconditional_jump;
}
@@ -4909,8 +6891,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
register from the stack, since lowest will == highest in
`pop_failure_point'. */
active_reg_t dummy_low_reg, dummy_high_reg;
- unsigned char *pdummy;
- const char *sdummy;
+ US_CHAR_TYPE *pdummy = NULL;
+ const CHAR_TYPE *sdummy = NULL;
DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
POP_FAILURE_POINT (sdummy, pdummy,
@@ -4975,7 +6957,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Have to succeed matching what follows at least n times.
After that, handle like `on_failure_jump'. */
case succeed_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
assert (mcnt >= 0);
@@ -4983,46 +6965,58 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
if (mcnt > 0)
{
mcnt--;
- p += 2;
+ p += OFFSET_ADDRESS_SIZE;
STORE_NUMBER_AND_INCR (p, mcnt);
#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt);
+ DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
+ , mcnt);
#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt);
+ DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
+ , mcnt);
#endif
}
else if (mcnt == 0)
{
#ifdef _LIBC
- DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
+ DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
+ p + OFFSET_ADDRESS_SIZE);
#else
- DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
-#endif
- p[2] = (unsigned char) no_op;
- p[3] = (unsigned char) no_op;
+ DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
+ p + OFFSET_ADDRESS_SIZE);
+#endif /* _LIBC */
+
+#ifdef MBS_SUPPORT
+ p[1] = (US_CHAR_TYPE) no_op;
+#else
+ p[2] = (US_CHAR_TYPE) no_op;
+ p[3] = (US_CHAR_TYPE) no_op;
+#endif /* MBS_SUPPORT */
goto on_failure;
}
break;
case jump_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
/* Originally, this is how many times we CAN jump. */
if (mcnt)
{
mcnt--;
- STORE_NUMBER (p + 2, mcnt);
+ STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
+
#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt);
+ DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
+ mcnt);
#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt);
-#endif
+ DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
+ mcnt);
+#endif /* _LIBC */
goto unconditional_jump;
}
/* If don't have to jump any more, skip over the rest of command. */
else
- p += 4;
+ p += 2 * OFFSET_ADDRESS_SIZE;
break;
case set_number_at:
@@ -5254,12 +7248,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
static boolean
group_match_null_string_p (p, end, reg_info)
- unsigned char **p, *end;
+ US_CHAR_TYPE **p, *end;
register_info_type *reg_info;
{
int mcnt;
/* Point to after the args to the start_memory. */
- unsigned char *p1 = *p + 2;
+ US_CHAR_TYPE *p1 = *p + 2;
while (p1 < end)
{
@@ -5297,14 +7291,16 @@ group_match_null_string_p (p, end, reg_info)
with an on_failure_jump (see above) that jumps to right
past a jump_past_alt. */
- while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
+ while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
+ jump_past_alt)
{
/* `mcnt' holds how many bytes long the alternative
is, including the ending `jump_past_alt' and
its number. */
- if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
- reg_info))
+ if (!alt_match_null_string_p (p1, p1 + mcnt -
+ (1 + OFFSET_ADDRESS_SIZE),
+ reg_info))
return false;
/* Move to right after this alternative, including the
@@ -5320,10 +7316,11 @@ group_match_null_string_p (p, end, reg_info)
alternative that starts with an on_failure_jump. */
p1++;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
+ if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
+ jump_past_alt)
{
/* Get to the beginning of the n-th alternative. */
- p1 -= 3;
+ p1 -= 1 + OFFSET_ADDRESS_SIZE;
break;
}
}
@@ -5331,7 +7328,7 @@ group_match_null_string_p (p, end, reg_info)
/* Deal with the last alternative: go back and get number
of the `jump_past_alt' just before it. `mcnt' contains
the length of the alternative. */
- EXTRACT_NUMBER (mcnt, p1 - 2);
+ EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
return false;
@@ -5363,11 +7360,11 @@ group_match_null_string_p (p, end, reg_info)
static boolean
alt_match_null_string_p (p, end, reg_info)
- unsigned char *p, *end;
+ US_CHAR_TYPE *p, *end;
register_info_type *reg_info;
{
int mcnt;
- unsigned char *p1 = p;
+ US_CHAR_TYPE *p1 = p;
while (p1 < end)
{
@@ -5400,13 +7397,13 @@ alt_match_null_string_p (p, end, reg_info)
static boolean
common_op_match_null_string_p (p, end, reg_info)
- unsigned char **p, *end;
+ US_CHAR_TYPE **p, *end;
register_info_type *reg_info;
{
int mcnt;
boolean ret;
int reg_no;
- unsigned char *p1 = *p;
+ US_CHAR_TYPE *p1 = *p;
switch ((re_opcode_t) *p1++)
{
@@ -5452,12 +7449,12 @@ common_op_match_null_string_p (p, end, reg_info)
case succeed_n:
/* Get to the number of times to succeed. */
- p1 += 2;
+ p1 += OFFSET_ADDRESS_SIZE;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
if (mcnt == 0)
{
- p1 -= 4;
+ p1 -= 2 * OFFSET_ADDRESS_SIZE;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
p1 += mcnt;
}
@@ -5471,7 +7468,7 @@ common_op_match_null_string_p (p, end, reg_info)
break;
case set_number_at:
- p1 += 4;
+ p1 += 2 * OFFSET_ADDRESS_SIZE;
default:
/* All other opcodes mean we cannot match the empty string. */
@@ -5488,15 +7485,21 @@ common_op_match_null_string_p (p, end, reg_info)
static int
bcmp_translate (s1, s2, len, translate)
- const char *s1, *s2;
+ const CHAR_TYPE *s1, *s2;
register int len;
RE_TRANSLATE_TYPE translate;
{
- register const unsigned char *p1 = (const unsigned char *) s1;
- register const unsigned char *p2 = (const unsigned char *) s2;
+ register const US_CHAR_TYPE *p1 = (const US_CHAR_TYPE *) s1;
+ register const US_CHAR_TYPE *p2 = (const US_CHAR_TYPE *) s2;
while (len)
{
+#ifdef MBS_SUPPORT
+ if (((*p1<=0xff)?translate[*p1++]:*p1++)
+ != ((*p2<=0xff)?translate[*p2++]:*p2++))
+ return 1;
+#else
if (translate[*p1++] != translate[*p2++]) return 1;
+#endif /* MBS_SUPPORT */
len--;
}
return 0;
@@ -5716,7 +7719,7 @@ regcomp (preg, pattern, cflags)
buffer. */
if (re_compile_fastmap (preg) == -2)
{
- /* Some error occured while computing the fastmap, just forget
+ /* Some error occurred while computing the fastmap, just forget
about it. */
free (preg->fastmap);
preg->fastmap = NULL;
OpenPOWER on IntegriCloud