diff options
author | tjr <tjr@FreeBSD.org> | 2004-07-04 01:03:02 +0000 |
---|---|---|
committer | tjr <tjr@FreeBSD.org> | 2004-07-04 01:03:02 +0000 |
commit | 5f9e4022f9e75041ea54067801e73f18ffab63ba (patch) | |
tree | 38d6f0c6303600623102f00fe0513f7022e84cdf /gnu/lib | |
parent | 15d9c0f83966da644ce0286d651be247cfdd61ae (diff) | |
download | FreeBSD-src-5f9e4022f9e75041ea54067801e73f18ffab63ba.zip FreeBSD-src-5f9e4022f9e75041ea54067801e73f18ffab63ba.tar.gz |
Import of GNU regex from GNU grep 2.5.1 (trimmed).
Diffstat (limited to 'gnu/lib')
-rw-r--r-- | gnu/lib/libregex/posix/regex.h | 48 | ||||
-rw-r--r-- | gnu/lib/libregex/regex.c | 2847 |
2 files changed, 2461 insertions, 434 deletions
diff --git a/gnu/lib/libregex/posix/regex.h b/gnu/lib/libregex/posix/regex.h index d88ab92..63c2fef 100644 --- a/gnu/lib/libregex/posix/regex.h +++ b/gnu/lib/libregex/posix/regex.h @@ -1,6 +1,6 @@ /* Definitions for data structures and routines for the regular expression library, version 0.12. - Copyright (C) 1985,89,90,91,92,93,95,96,97,98 Free Software Foundation, Inc. + Copyright (C) 1985,1989-1993,1995-1998, 2000 Free Software Foundation, Inc. This file is part of the GNU C Library. Its master source is NOT part of the C library, however. The master source lives in /gd/gnu/lib. @@ -160,6 +160,11 @@ typedef unsigned long int reg_syntax_t; this bit set, and it won't affect anything in the normal case. */ #define RE_DEBUG (RE_NO_GNU_OPS << 1) +/* If this bit is set, a syntactically invalid interval is treated as + a string of ordinary characters. For example, the ERE 'a{1' is + treated as 'a\{1'. */ +#define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1) + /* This global variable defines the particular regexp syntax to use (for some interfaces). When a regexp is compiled, the syntax used is stored in the pattern buffer, so changing this does not affect @@ -199,7 +204,8 @@ extern reg_syntax_t re_syntax_options; | RE_NO_BK_VBAR) #define RE_SYNTAX_POSIX_EGREP \ - (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES \ + | RE_INVALID_INTERVAL_ORD) /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ #define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC @@ -221,13 +227,13 @@ extern reg_syntax_t re_syntax_options; (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) #define RE_SYNTAX_POSIX_EXTENDED \ - (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ - | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ - | RE_UNMATCHED_RIGHT_PAREN_ORD) + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD) -/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS - replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ +/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is + removed and RE_NO_BK_REFS is added. */ #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ @@ -513,13 +519,31 @@ extern int re_exec _RE_ARGS ((const char *)); # endif #endif +/* GCC 2.95 and later have "__restrict"; C99 compilers have + "restrict", and "configure" may have defined "restrict". */ +#ifndef __restrict +# if ! (2 < __GNUC__ || (2 == __GNUC__ && 95 <= __GNUC_MINOR__)) +# if defined restrict || 199901L <= __STDC_VERSION__ +# define __restrict restrict +# else +# define __restrict +# endif +# endif +#endif +/* For now unconditionally define __restrict_arr to expand to nothing. + Ideally we would have a test for the compiler which allows defining + it to restrict. */ +#define __restrict_arr + /* POSIX compatibility. */ -extern int regcomp _RE_ARGS ((regex_t *__preg, const char *__pattern, +extern int regcomp _RE_ARGS ((regex_t *__restrict __preg, + const char *__restrict __pattern, int __cflags)); -extern int regexec _RE_ARGS ((const regex_t *__preg, - const char *__string, size_t __nmatch, - regmatch_t __pmatch[], int __eflags)); +extern int regexec _RE_ARGS ((const regex_t *__restrict __preg, + const char *__restrict __string, size_t __nmatch, + regmatch_t __pmatch[__restrict_arr], + int __eflags)); extern size_t regerror _RE_ARGS ((int __errcode, const regex_t *__preg, char *__errbuf, size_t __errbuf_size)); diff --git a/gnu/lib/libregex/regex.c b/gnu/lib/libregex/regex.c index 640e3fe..358071b 100644 --- a/gnu/lib/libregex/regex.c +++ b/gnu/lib/libregex/regex.c @@ -2,7 +2,7 @@ version 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the internationalization features.) - Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc. + Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as @@ -56,6 +56,30 @@ # include <wctype.h> #endif +/* This is for multi byte string support. */ +#ifdef MBS_SUPPORT +# define CHAR_TYPE wchar_t +# define US_CHAR_TYPE wchar_t/* unsigned character type */ +# define COMPILED_BUFFER_VAR wc_buffer +# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */ +# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1) +# define PUT_CHAR(c) \ + do { \ + if (MB_CUR_MAX == 1) \ + putchar (c); \ + else \ + printf ("%C", (wint_t) c); /* Should we use wide stream?? */ \ + } while (0) +# define TRUE 1 +# define FALSE 0 +#else +# define CHAR_TYPE char +# define US_CHAR_TYPE unsigned char /* unsigned character type */ +# define COMPILED_BUFFER_VAR bufp->buffer +# define OFFSET_ADDRESS_SIZE 2 +# define PUT_CHAR(c) putchar (c) +#endif /* MBS_SUPPORT */ + #ifdef _LIBC /* We have to keep the namespace clean. */ # define regfree(preg) __regfree (preg) @@ -78,19 +102,30 @@ __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) -#define btowc __btowc -#endif +# define btowc __btowc +/* We are also using some library internals. */ +# include <locale/localeinfo.h> +# include <locale/elem-hash.h> +# include <langinfo.h> +# include <locale/coll-lookup.h> +#endif -#ifndef _ -/* This is for other GNU distributions with internationalized messages. - When compiling libc, the _ and N_ macros are predefined. */ -# ifdef HAVE_LIBINTL_H -# include <libintl.h> -# else -# define gettext(msgid) (msgid) +/* This is for other GNU distributions with internationalized messages. */ +#if HAVE_LIBINTL_H || defined _LIBC +# include <libintl.h> +# ifdef _LIBC +# undef gettext +# define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES) # endif -# define N_(msgid) (msgid) +#else +# define gettext(msgid) (msgid) +#endif + +#ifndef gettext_noop +/* This define is so xgettext can find the internationalizable + strings. */ +# define gettext_noop(String) String #endif /* The `emacs' switch turns on certain matching commands @@ -164,6 +199,14 @@ char *realloc (); # endif #endif /* not emacs */ + +#if defined _LIBC || HAVE_LIMITS_H +# include <limits.h> +#endif + +#ifndef MB_LEN_MAX +# define MB_LEN_MAX 1 +#endif /* Get the interface, including the syntax bits. */ #include <regex.h> @@ -247,6 +290,8 @@ extern char *re_syntax_table; static char re_syntax_table[CHAR_SET_SIZE]; +static void init_syntax_once PARAMS ((void)); + static void init_syntax_once () { @@ -268,7 +313,7 @@ init_syntax_once () # endif /* not SYNTAX_TABLE */ -# define SYNTAX(c) re_syntax_table[((c) & 0xFF)] +# define SYNTAX(c) re_syntax_table[(unsigned char) (c)] #endif /* emacs */ @@ -396,6 +441,11 @@ typedef enum /* Followed by one byte giving n, then by n literal bytes. */ exactn, +#ifdef MBS_SUPPORT + /* Same as exactn, but contains binary data. */ + exactn_bin, +#endif + /* Matches any (more or less) character. */ anychar, @@ -405,6 +455,13 @@ typedef enum are ordered low-bit-first. A character is in the set if its bit is 1. A character too large to have a bit in the map is automatically not in the set. */ + /* ifdef MBS_SUPPORT, following element is length of character + classes, length of collating symbols, length of equivalence + classes, length of character ranges, and length of characters. + Next, character class element, collating symbols elements, + equivalence class elements, range elements, and character + elements follow. + See regex_compile function. */ charset, /* Same parameters as charset, but match any character that is @@ -454,6 +511,7 @@ typedef enum /* Followed by two-byte relative address of place to resume at in case of failure. */ + /* ifdef MBS_SUPPORT, the size of address is 1. */ on_failure_jump, /* Like on_failure_jump, but pushes a placeholder instead of the @@ -462,6 +520,7 @@ typedef enum /* Throw away latest failure point and then jump to following two-byte relative address. */ + /* ifdef MBS_SUPPORT, the size of address is 1. */ pop_failure_jump, /* Change to pop_failure_jump if know won't have to backtrack to @@ -471,6 +530,7 @@ typedef enum sure that there is no use backtracking out of repetitions already matched, then we change it to a pop_failure_jump. Followed by two-byte address. */ + /* ifdef MBS_SUPPORT, the size of address is 1. */ maybe_pop_jump, /* Jump to following two-byte address, and push a dummy failure @@ -478,6 +538,7 @@ typedef enum is made to use it for a failure. A `+' construct makes this before the first repeat. Also used as an intermediary kind of jump when compiling an alternative. */ + /* ifdef MBS_SUPPORT, the size of address is 1. */ dummy_failure_jump, /* Push a dummy failure point and continue. Used at the end of @@ -486,15 +547,18 @@ typedef enum /* Followed by two-byte relative address and two-byte number n. After matching N times, jump to the address upon failure. */ + /* ifdef MBS_SUPPORT, the size of address is 1. */ succeed_n, /* Followed by two-byte relative address, and two-byte number n. Jump to the address N times, then fail. */ + /* ifdef MBS_SUPPORT, the size of address is 1. */ jump_n, /* Set the following two-byte relative address to the subsequent two-byte number. The address *includes* the two bytes of number. */ + /* ifdef MBS_SUPPORT, the size of address is 1. */ set_number_at, wordchar, /* Matches any word-constituent character. */ @@ -523,42 +587,63 @@ typedef enum /* Common operations on the compiled pattern. */ /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ +/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ -#define STORE_NUMBER(destination, number) \ +#ifdef MBS_SUPPORT +# define STORE_NUMBER(destination, number) \ + do { \ + *(destination) = (US_CHAR_TYPE)(number); \ + } while (0) +#else +# define STORE_NUMBER(destination, number) \ do { \ (destination)[0] = (number) & 0377; \ (destination)[1] = (number) >> 8; \ } while (0) +#endif /* MBS_SUPPORT */ /* Same as STORE_NUMBER, except increment DESTINATION to the byte after where the number is stored. Therefore, DESTINATION must be an lvalue. */ +/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ #define STORE_NUMBER_AND_INCR(destination, number) \ do { \ STORE_NUMBER (destination, number); \ - (destination) += 2; \ + (destination) += OFFSET_ADDRESS_SIZE; \ } while (0) /* Put into DESTINATION a number stored in two contiguous bytes starting at SOURCE. */ +/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ -#define EXTRACT_NUMBER(destination, source) \ +#ifdef MBS_SUPPORT +# define EXTRACT_NUMBER(destination, source) \ + do { \ + (destination) = *(source); \ + } while (0) +#else +# define EXTRACT_NUMBER(destination, source) \ do { \ (destination) = *(source) & 0377; \ (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ } while (0) +#endif #ifdef DEBUG -static void extract_number _RE_ARGS ((int *dest, unsigned char *source)); +static void extract_number _RE_ARGS ((int *dest, US_CHAR_TYPE *source)); static void extract_number (dest, source) int *dest; - unsigned char *source; + US_CHAR_TYPE *source; { +#ifdef MBS_SUPPORT + *dest = *source; +#else int temp = SIGN_EXTEND_CHAR (*(source + 1)); *dest = *source & 0377; *dest += temp << 8; +#endif } # ifndef EXTRACT_MACROS /* To debug the macros. */ @@ -574,19 +659,19 @@ extract_number (dest, source) #define EXTRACT_NUMBER_AND_INCR(destination, source) \ do { \ EXTRACT_NUMBER (destination, source); \ - (source) += 2; \ + (source) += OFFSET_ADDRESS_SIZE; \ } while (0) #ifdef DEBUG static void extract_number_and_incr _RE_ARGS ((int *destination, - unsigned char **source)); + US_CHAR_TYPE **source)); static void extract_number_and_incr (destination, source) int *destination; - unsigned char **source; + US_CHAR_TYPE **source; { extract_number (destination, *source); - *source += 2; + *source += OFFSET_ADDRESS_SIZE; } # ifndef EXTRACT_MACROS @@ -660,13 +745,13 @@ print_fastmap (fastmap) void print_partial_compiled_pattern (start, end) - unsigned char *start; - unsigned char *end; + US_CHAR_TYPE *start; + US_CHAR_TYPE *end; { int mcnt, mcnt2; - unsigned char *p1; - unsigned char *p = start; - unsigned char *pend = end; + US_CHAR_TYPE *p1; + US_CHAR_TYPE *p = start; + US_CHAR_TYPE *pend = end; if (start == NULL) { @@ -677,7 +762,11 @@ print_partial_compiled_pattern (start, end) /* Loop over pattern commands. */ while (p < pend) { - printf ("%d:\t", p - start); +#ifdef _LIBC + printf ("%td:\t", p - start); +#else + printf ("%ld:\t", (long int) (p - start)); +#endif switch ((re_opcode_t) *p++) { @@ -691,23 +780,35 @@ print_partial_compiled_pattern (start, end) do { putchar ('/'); - putchar (*p++); + PUT_CHAR (*p++); + } + while (--mcnt); + break; + +#ifdef MBS_SUPPORT + case exactn_bin: + mcnt = *p++; + printf ("/exactn_bin/%d", mcnt); + do + { + printf("/%lx", (long int) *p++); } while (--mcnt); break; +#endif /* MBS_SUPPORT */ case start_memory: mcnt = *p++; - printf ("/start_memory/%d/%d", mcnt, *p++); + printf ("/start_memory/%d/%ld", mcnt, (long int) *p++); break; case stop_memory: mcnt = *p++; - printf ("/stop_memory/%d/%d", mcnt, *p++); + printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++); break; case duplicate: - printf ("/duplicate/%d", *p++); + printf ("/duplicate/%ld", (long int) *p++); break; case anychar: @@ -717,6 +818,51 @@ print_partial_compiled_pattern (start, end) case charset: case charset_not: { +#ifdef MBS_SUPPORT + int i, length; + wchar_t *workp = p; + printf ("/charset [%s", + (re_opcode_t) *(workp - 1) == charset_not ? "^" : ""); + p += 5; + length = *workp++; /* the length of char_classes */ + for (i=0 ; i<length ; i++) + printf("[:%lx:]", (long int) *p++); + length = *workp++; /* the length of collating_symbol */ + for (i=0 ; i<length ;) + { + printf("[."); + while(*p != 0) + PUT_CHAR((i++,*p++)); + i++,p++; + printf(".]"); + } + length = *workp++; /* the length of equivalence_class */ + for (i=0 ; i<length ;) + { + printf("[="); + while(*p != 0) + PUT_CHAR((i++,*p++)); + i++,p++; + printf("=]"); + } + length = *workp++; /* the length of char_range */ + for (i=0 ; i<length ; i++) + { + wchar_t range_start = *p++; + wchar_t range_end = *p++; + if (MB_CUR_MAX == 1) + printf("%c-%c", (char) range_start, (char) range_end); + else + printf("%C-%C", (wint_t) range_start, (wint_t) range_end); + } + length = *workp++; /* the length of char */ + for (i=0 ; i<length ; i++) + if (MB_CUR_MAX == 1) + putchar (*p++); + else + printf("%C", (wint_t) *p++); + putchar (']'); +#else register int c, last = -100; register int in_range = 0; @@ -754,6 +900,7 @@ print_partial_compiled_pattern (start, end) putchar (']'); p += 1 + *p; +#endif /* MBS_SUPPORT */ } break; @@ -767,17 +914,30 @@ print_partial_compiled_pattern (start, end) case on_failure_jump: extract_number_and_incr (&mcnt, &p); - printf ("/on_failure_jump to %d", p + mcnt - start); +#ifdef _LIBC + printf ("/on_failure_jump to %td", p + mcnt - start); +#else + printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start)); +#endif break; case on_failure_keep_string_jump: extract_number_and_incr (&mcnt, &p); - printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); +#ifdef _LIBC + printf ("/on_failure_keep_string_jump to %td", p + mcnt - start); +#else + printf ("/on_failure_keep_string_jump to %ld", + (long int) (p + mcnt - start)); +#endif break; case dummy_failure_jump: extract_number_and_incr (&mcnt, &p); - printf ("/dummy_failure_jump to %d", p + mcnt - start); +#ifdef _LIBC + printf ("/dummy_failure_jump to %td", p + mcnt - start); +#else + printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start)); +#endif break; case push_dummy_failure: @@ -786,29 +946,50 @@ print_partial_compiled_pattern (start, end) case maybe_pop_jump: extract_number_and_incr (&mcnt, &p); - printf ("/maybe_pop_jump to %d", p + mcnt - start); +#ifdef _LIBC + printf ("/maybe_pop_jump to %td", p + mcnt - start); +#else + printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start)); +#endif break; case pop_failure_jump: extract_number_and_incr (&mcnt, &p); - printf ("/pop_failure_jump to %d", p + mcnt - start); +#ifdef _LIBC + printf ("/pop_failure_jump to %td", p + mcnt - start); +#else + printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start)); +#endif break; case jump_past_alt: extract_number_and_incr (&mcnt, &p); - printf ("/jump_past_alt to %d", p + mcnt - start); +#ifdef _LIBC + printf ("/jump_past_alt to %td", p + mcnt - start); +#else + printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start)); +#endif break; case jump: extract_number_and_incr (&mcnt, &p); - printf ("/jump to %d", p + mcnt - start); +#ifdef _LIBC + printf ("/jump to %td", p + mcnt - start); +#else + printf ("/jump to %ld", (long int) (p + mcnt - start)); +#endif break; case succeed_n: extract_number_and_incr (&mcnt, &p); p1 = p + mcnt; extract_number_and_incr (&mcnt2, &p); - printf ("/succeed_n to %d, %d times", p1 - start, mcnt2); +#ifdef _LIBC + printf ("/succeed_n to %td, %d times", p1 - start, mcnt2); +#else + printf ("/succeed_n to %ld, %d times", + (long int) (p1 - start), mcnt2); +#endif break; case jump_n: @@ -822,7 +1003,12 @@ print_partial_compiled_pattern (start, end) extract_number_and_incr (&mcnt, &p); p1 = p + mcnt; extract_number_and_incr (&mcnt2, &p); - printf ("/set_number_at location %d to %d", p1 - start, mcnt2); +#ifdef _LIBC + printf ("/set_number_at location %td to %d", p1 - start, mcnt2); +#else + printf ("/set_number_at location %ld to %d", + (long int) (p1 - start), mcnt2); +#endif break; case wordbound: @@ -839,6 +1025,7 @@ print_partial_compiled_pattern (start, end) case wordend: printf ("/wordend"); + break; # ifdef emacs case before_dot: @@ -883,13 +1070,17 @@ print_partial_compiled_pattern (start, end) break; default: - printf ("?%d", *(p-1)); + printf ("?%ld", (long int) *(p-1)); } putchar ('\n'); } - printf ("%d:\tend of pattern.\n", p - start); +#ifdef _LIBC + printf ("%td:\tend of pattern.\n", p - start); +#else + printf ("%ld:\tend of pattern.\n", (long int) (p - start)); +#endif } @@ -897,9 +1088,10 @@ void print_compiled_pattern (bufp) struct re_pattern_buffer *bufp; { - unsigned char *buffer = bufp->buffer; + US_CHAR_TYPE *buffer = (US_CHAR_TYPE*) bufp->buffer; - print_partial_compiled_pattern (buffer, buffer + bufp->used); + print_partial_compiled_pattern (buffer, buffer + + bufp->used / sizeof(US_CHAR_TYPE)); printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, bufp->allocated); @@ -909,7 +1101,11 @@ print_compiled_pattern (bufp) print_fastmap (bufp->fastmap); } - printf ("re_nsub: %d\t", bufp->re_nsub); +#ifdef _LIBC + printf ("re_nsub: %Zd\t", bufp->re_nsub); +#else + printf ("re_nsub: %ld\t", (long int) bufp->re_nsub); +#endif printf ("regs_alloc: %d\t", bufp->regs_allocated); printf ("can_be_null: %d\t", bufp->can_be_null); printf ("newline_anchor: %d\n", bufp->newline_anchor); @@ -923,9 +1119,9 @@ print_compiled_pattern (bufp) void print_double_string (where, string1, size1, string2, size2) - const char *where; - const char *string1; - const char *string2; + const CHAR_TYPE *where; + const CHAR_TYPE *string1; + const CHAR_TYPE *string2; int size1; int size2; { @@ -938,13 +1134,13 @@ print_double_string (where, string1, size1, string2, size2) if (FIRST_STRING_P (where)) { for (this_char = where - string1; this_char < size1; this_char++) - putchar (string1[this_char]); + PUT_CHAR (string1[this_char]); where = string2; } for (this_char = where - string2; this_char < size2; this_char++) - putchar (string2[this_char]); + PUT_CHAR (string2[this_char]); } } @@ -970,6 +1166,92 @@ printchar (c) #endif /* not DEBUG */ +#ifdef MBS_SUPPORT +/* This convert a multibyte string to a wide character string. + And write their correspondances to offset_buffer(see below) + and write whether each wchar_t is binary data to is_binary. + This assume invalid multibyte sequences as binary data. + We assume offset_buffer and is_binary is already allocated + enough space. */ + +static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src, + size_t len, int *offset_buffer, + char *is_binary); +static size_t +convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary) + CHAR_TYPE *dest; + const unsigned char* src; + size_t len; /* the length of multibyte string. */ + + /* It hold correspondances between src(char string) and + dest(wchar_t string) for optimization. + e.g. src = "xxxyzz" + dest = {'X', 'Y', 'Z'} + (each "xxx", "y" and "zz" represent one multibyte character + corresponding to 'X', 'Y' and 'Z'.) + offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")} + = {0, 3, 4, 6} + */ + int *offset_buffer; + char *is_binary; +{ + wchar_t *pdest = dest; + const unsigned char *psrc = src; + size_t wc_count = 0; + + if (MB_CUR_MAX == 1) + { /* We don't need conversion. */ + for ( ; wc_count < len ; ++wc_count) + { + *pdest++ = *psrc++; + is_binary[wc_count] = FALSE; + offset_buffer[wc_count] = wc_count; + } + offset_buffer[wc_count] = wc_count; + } + else + { + /* We need conversion. */ + mbstate_t mbs; + int consumed; + size_t mb_remain = len; + size_t mb_count = 0; + + /* Initialize the conversion state. */ + memset (&mbs, 0, sizeof (mbstate_t)); + + offset_buffer[0] = 0; + for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed, + psrc += consumed) + { + consumed = mbrtowc (pdest, psrc, mb_remain, &mbs); + + if (consumed <= 0) + /* failed to convert. maybe src contains binary data. + So we consume 1 byte manualy. */ + { + *pdest = *psrc; + consumed = 1; + is_binary[wc_count] = TRUE; + } + else + is_binary[wc_count] = FALSE; + /* In sjis encoding, we use yen sign as escape character in + place of reverse solidus. So we convert 0x5c(yen sign in + sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse + solidus in UCS2). */ + if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5) + *pdest = (wchar_t) *psrc; + + offset_buffer[wc_count + 1] = mb_count += consumed; + } + } + + return wc_count; +} + +#endif /* MBS_SUPPORT */ + /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own syntax, so it can be changed between regex compilations. */ @@ -1009,68 +1291,59 @@ weak_alias (__re_set_syntax, re_set_syntax) POSIX doesn't require that we do anything for REG_NOERROR, but why not be nice? */ -#if 0 - /* This section is for xgettext; it sees the strings wrapped inside - N_() and marks them as needing translation. They should match - the strings in re_error_msgid. We can't use the usual string - concatenation trick to initialize re_error_msgid, since other GNU - distributions use this file with traditional C, and traditional C - lacks string concatenation. */ - N_("Success") /* REG_NOERROR */ - N_("No match") /* REG_NOMATCH */ - N_("Invalid regular expression") /* REG_BADPAT */ - N_("Invalid collation character") /* REG_ECOLLATE */ - N_("Invalid character class name") /* REG_ECTYPE */ - N_("Trailing backslash") /* REG_EESCAPE */ - N_("Invalid back reference") /* REG_ESUBREG */ - N_("Unmatched [ or [^") /* REG_EBRACK */ - N_("Unmatched ( or \\(") /* REG_EPAREN */ - N_("Unmatched \\{") /* REG_EBRACE */ - N_("Invalid content of \\{\\}") /* REG_BADBR */ - N_("Invalid range end") /* REG_ERANGE */ - N_("Memory exhausted") /* REG_ESPACE */ - N_("Invalid preceding regular expression") /* REG_BADRPT */ - N_("Premature end of regular expression") /* REG_EEND */ - N_("Regular expression too big") /* REG_ESIZE */ - N_("Unmatched ) or \\)") /* REG_ERPAREN */ -#endif - -static const char re_error_msgid[] = "\ -Success\0\ -No match\0\ -Invalid regular expression\0\ -Invalid collation character\0\ -Invalid character class name\0\ -Trailing backslash\0\ -Invalid back reference\0\ -Unmatched [ or [^\0\ -Unmatched ( or \\(\0\ -Unmatched \\{\0\ -Invalid content of \\{\\}\0\ -Invalid range end\0\ -Memory exhausted\0\ -Invalid preceding regular expression\0\ -Premature end of regular expression\0\ -Regular expression too big\0\ -Unmatched ) or \\)"; - +static const char re_error_msgid[] = + { #define REG_NOERROR_IDX 0 + gettext_noop ("Success") /* REG_NOERROR */ + "\0" #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") + gettext_noop ("No match") /* REG_NOMATCH */ + "\0" #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") + gettext_noop ("Invalid regular expression") /* REG_BADPAT */ + "\0" #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") + gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ + "\0" #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") + gettext_noop ("Invalid character class name") /* REG_ECTYPE */ + "\0" #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") + gettext_noop ("Trailing backslash") /* REG_EESCAPE */ + "\0" #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") + gettext_noop ("Invalid back reference") /* REG_ESUBREG */ + "\0" #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") + gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ + "\0" #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") + gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ + "\0" #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") + gettext_noop ("Unmatched \\{") /* REG_EBRACE */ + "\0" #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") + gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ + "\0" #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") + gettext_noop ("Invalid range end") /* REG_ERANGE */ + "\0" #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") + gettext_noop ("Memory exhausted") /* REG_ESPACE */ + "\0" #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") + gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ + "\0" #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") + gettext_noop ("Premature end of regular expression") /* REG_EEND */ + "\0" #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") + gettext_noop ("Regular expression too big") /* REG_ESIZE */ + "\0" #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") + gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ + }; static const size_t re_error_msgid_idx[] = { @@ -1160,7 +1433,7 @@ long int re_max_failures = 2000; union fail_stack_elt { - unsigned char *pointer; + US_CHAR_TYPE *pointer; long int integer; }; @@ -1178,14 +1451,14 @@ typedef struct # if defined MATCH_MAY_ALLOCATE /* 4400 was enough to cause a crash on Alpha OSF/1, whose default stack limit is 2mb. */ -int re_max_failures = 20000; +int re_max_failures = 4000; # else int re_max_failures = 2000; # endif union fail_stack_elt { - unsigned char *pointer; + US_CHAR_TYPE *pointer; int integer; }; @@ -1267,7 +1540,7 @@ typedef struct Assumes the variable `fail_stack'. Probably should only be called from within `PUSH_FAILURE_POINT'. */ #define PUSH_FAILURE_POINTER(item) \ - fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) + fail_stack.stack[fail_stack.avail++].pointer = (US_CHAR_TYPE *) (item) /* This pushes an integer-valued item onto the failure stack. Assumes the variable `fail_stack'. Probably should only @@ -1424,12 +1697,11 @@ typedef struct Also assumes the variables `fail_stack' and (if debugging), `bufp', `pend', `string1', `size1', `string2', and `size2'. */ - #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ { \ DEBUG_STATEMENT (unsigned failure_id;) \ active_reg_t this_reg; \ - const unsigned char *string_temp; \ + const US_CHAR_TYPE *string_temp; \ \ assert (!FAIL_STACK_EMPTY ()); \ \ @@ -1448,13 +1720,13 @@ typedef struct saved NULL, thus retaining our current position in the string. */ \ string_temp = POP_FAILURE_POINTER (); \ if (string_temp != NULL) \ - str = (const char *) string_temp; \ + str = (const CHAR_TYPE *) string_temp; \ \ DEBUG_PRINT2 (" Popping string %p: `", str); \ DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ DEBUG_PRINT1 ("'\n"); \ \ - pat = (unsigned char *) POP_FAILURE_POINTER (); \ + pat = (US_CHAR_TYPE *) POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \ DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ \ @@ -1474,10 +1746,10 @@ typedef struct DEBUG_PRINT2 (" info: %p\n", \ reg_info[this_reg].word.pointer); \ \ - regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ + regend[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ \ - regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ + regstart[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER ();\ DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ } \ else \ @@ -1495,7 +1767,6 @@ typedef struct DEBUG_STATEMENT (nfailure_points_popped++); \ } /* POP_FAILURE_POINT */ - /* Structure for per-register (a.k.a. per-group) information. Other register information, such as the @@ -1553,7 +1824,7 @@ typedef union while (0) /* Registers are set to a sentinel when they haven't yet matched. */ -static char reg_unset_dummy; +static CHAR_TYPE reg_unset_dummy; #define REG_UNSET_VALUE (®_unset_dummy) #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) @@ -1562,40 +1833,65 @@ static char reg_unset_dummy; static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)); -static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg)); -static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, +static void store_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, int arg)); +static void store_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, int arg1, int arg2)); -static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, - int arg, unsigned char *end)); -static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, - int arg1, int arg2, unsigned char *end)); -static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p, +static void insert_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, + int arg, US_CHAR_TYPE *end)); +static void insert_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, + int arg1, int arg2, US_CHAR_TYPE *end)); +static boolean at_begline_loc_p _RE_ARGS ((const CHAR_TYPE *pattern, + const CHAR_TYPE *p, reg_syntax_t syntax)); -static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend, +static boolean at_endline_loc_p _RE_ARGS ((const CHAR_TYPE *p, + const CHAR_TYPE *pend, reg_syntax_t syntax)); -static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, - const char *pend, +#ifdef MBS_SUPPORT +static reg_errcode_t compile_range _RE_ARGS ((CHAR_TYPE range_start, + const CHAR_TYPE **p_ptr, + const CHAR_TYPE *pend, + char *translate, + reg_syntax_t syntax, + US_CHAR_TYPE *b, + CHAR_TYPE *char_set)); +static void insert_space _RE_ARGS ((int num, CHAR_TYPE *loc, CHAR_TYPE *end)); +#else +static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start, + const CHAR_TYPE **p_ptr, + const CHAR_TYPE *pend, char *translate, reg_syntax_t syntax, - unsigned char *b)); + US_CHAR_TYPE *b)); +#endif /* MBS_SUPPORT */ /* Fetch the next character in the uncompiled pattern---translating it if necessary. Also cast from a signed character in the constant string passed to us by the user to an unsigned char that we can use as an array index (in, e.g., `translate'). */ +/* ifdef MBS_SUPPORT, we translate only if character <= 0xff, + because it is impossible to allocate 4GB array for some encodings + which have 4 byte character_set like UCS4. */ #ifndef PATFETCH -# define PATFETCH(c) \ +# ifdef MBS_SUPPORT +# define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (US_CHAR_TYPE) *p++; \ + if (translate && (c <= 0xff)) c = (US_CHAR_TYPE) translate[c]; \ + } while (0) +# else +# define PATFETCH(c) \ do {if (p == pend) return REG_EEND; \ c = (unsigned char) *p++; \ if (translate) c = (unsigned char) translate[c]; \ } while (0) +# endif /* MBS_SUPPORT */ #endif /* Fetch the next character in the uncompiled pattern, with no translation. */ #define PATFETCH_RAW(c) \ do {if (p == pend) return REG_EEND; \ - c = (unsigned char) *p++; \ + c = (US_CHAR_TYPE) *p++; \ } while (0) /* Go backwards one character in the pattern. */ @@ -1606,27 +1902,43 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, cast the subscript to translate because some data is declared as `char *', to avoid warnings when a string constant is passed. But when we use a character as a subscript we must make it unsigned. */ +/* ifdef MBS_SUPPORT, we translate only if character <= 0xff, + because it is impossible to allocate 4GB array for some encodings + which have 4 byte character_set like UCS4. */ #ifndef TRANSLATE -# define TRANSLATE(d) \ +# ifdef MBS_SUPPORT +# define TRANSLATE(d) \ + ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \ + ? (char) translate[(unsigned char) (d)] : (d)) +#else +# define TRANSLATE(d) \ (translate ? (char) translate[(unsigned char) (d)] : (d)) +# endif /* MBS_SUPPORT */ #endif /* Macros for outputting the compiled pattern into `buffer'. */ /* If the buffer isn't allocated when it comes in, use this. */ -#define INIT_BUF_SIZE 32 +#define INIT_BUF_SIZE (32 * sizeof(US_CHAR_TYPE)) /* Make sure we have at least N more bytes of space in buffer. */ -#define GET_BUFFER_SPACE(n) \ +#ifdef MBS_SUPPORT +# define GET_BUFFER_SPACE(n) \ + while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \ + + (n)*sizeof(CHAR_TYPE)) > bufp->allocated) \ + EXTEND_BUFFER () +#else +# define GET_BUFFER_SPACE(n) \ while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ EXTEND_BUFFER () +#endif /* MBS_SUPPORT */ /* Make sure we have one more byte of buffer space and then add C to it. */ #define BUF_PUSH(c) \ do { \ GET_BUFFER_SPACE (1); \ - *b++ = (unsigned char) (c); \ + *b++ = (US_CHAR_TYPE) (c); \ } while (0) @@ -1634,8 +1946,8 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, #define BUF_PUSH_2(c1, c2) \ do { \ GET_BUFFER_SPACE (2); \ - *b++ = (unsigned char) (c1); \ - *b++ = (unsigned char) (c2); \ + *b++ = (US_CHAR_TYPE) (c1); \ + *b++ = (US_CHAR_TYPE) (c2); \ } while (0) @@ -1643,28 +1955,28 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, #define BUF_PUSH_3(c1, c2, c3) \ do { \ GET_BUFFER_SPACE (3); \ - *b++ = (unsigned char) (c1); \ - *b++ = (unsigned char) (c2); \ - *b++ = (unsigned char) (c3); \ + *b++ = (US_CHAR_TYPE) (c1); \ + *b++ = (US_CHAR_TYPE) (c2); \ + *b++ = (US_CHAR_TYPE) (c3); \ } while (0) - /* Store a jump with opcode OP at LOC to location TO. We store a relative address offset by the three bytes the jump itself occupies. */ #define STORE_JUMP(op, loc, to) \ - store_op1 (op, loc, (int) ((to) - (loc) - 3)) + store_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE))) /* Likewise, for a two-argument jump. */ #define STORE_JUMP2(op, loc, to, arg) \ - store_op2 (op, loc, (int) ((to) - (loc) - 3), arg) + store_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg) /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ #define INSERT_JUMP(op, loc, to) \ - insert_op1 (op, loc, (int) ((to) - (loc) - 3), b) + insert_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b) /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ #define INSERT_JUMP2(op, loc, to, arg) \ - insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b) + insert_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\ + arg, b) /* This is not an arbitrary limit: the arguments which represent offsets @@ -1690,31 +2002,90 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, reset the pointers that pointed into the old block to point to the correct places in the new one. If extending the buffer results in it being larger than MAX_BUF_SIZE, then flag memory exhausted. */ -#define EXTEND_BUFFER() \ - do { \ - unsigned char *old_buffer = bufp->buffer; \ - if (bufp->allocated == MAX_BUF_SIZE) \ +#if __BOUNDED_POINTERS__ +# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated) +# define MOVE_BUFFER_POINTER(P) \ + (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr) +# define ELSE_EXTEND_BUFFER_HIGH_BOUND \ + else \ + { \ + SET_HIGH_BOUND (b); \ + SET_HIGH_BOUND (begalt); \ + if (fixup_alt_jump) \ + SET_HIGH_BOUND (fixup_alt_jump); \ + if (laststart) \ + SET_HIGH_BOUND (laststart); \ + if (pending_exact) \ + SET_HIGH_BOUND (pending_exact); \ + } +#else +# define MOVE_BUFFER_POINTER(P) (P) += incr +# define ELSE_EXTEND_BUFFER_HIGH_BOUND +#endif + +#ifdef MBS_SUPPORT +# define EXTEND_BUFFER() \ + do { \ + US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \ + int wchar_count; \ + if (bufp->allocated + sizeof(US_CHAR_TYPE) > MAX_BUF_SIZE) \ return REG_ESIZE; \ bufp->allocated <<= 1; \ if (bufp->allocated > MAX_BUF_SIZE) \ - bufp->allocated = MAX_BUF_SIZE; \ - bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\ - if (bufp->buffer == NULL) \ + bufp->allocated = MAX_BUF_SIZE; \ + /* How many characters the new buffer can have? */ \ + wchar_count = bufp->allocated / sizeof(US_CHAR_TYPE); \ + if (wchar_count == 0) wchar_count = 1; \ + /* Truncate the buffer to CHAR_TYPE align. */ \ + bufp->allocated = wchar_count * sizeof(US_CHAR_TYPE); \ + RETALLOC (COMPILED_BUFFER_VAR, wchar_count, US_CHAR_TYPE); \ + bufp->buffer = (char*)COMPILED_BUFFER_VAR; \ + if (COMPILED_BUFFER_VAR == NULL) \ return REG_ESPACE; \ /* If the buffer moved, move all the pointers into it. */ \ - if (old_buffer != bufp->buffer) \ + if (old_buffer != COMPILED_BUFFER_VAR) \ { \ - b = (b - old_buffer) + bufp->buffer; \ - begalt = (begalt - old_buffer) + bufp->buffer; \ - if (fixup_alt_jump) \ - fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ - if (laststart) \ - laststart = (laststart - old_buffer) + bufp->buffer; \ - if (pending_exact) \ - pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ + int incr = COMPILED_BUFFER_VAR - old_buffer; \ + MOVE_BUFFER_POINTER (b); \ + MOVE_BUFFER_POINTER (begalt); \ + if (fixup_alt_jump) \ + MOVE_BUFFER_POINTER (fixup_alt_jump); \ + if (laststart) \ + MOVE_BUFFER_POINTER (laststart); \ + if (pending_exact) \ + MOVE_BUFFER_POINTER (pending_exact); \ } \ + ELSE_EXTEND_BUFFER_HIGH_BOUND \ } while (0) - +#else +# define EXTEND_BUFFER() \ + do { \ + US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (US_CHAR_TYPE *) REALLOC (COMPILED_BUFFER_VAR, \ + bufp->allocated); \ + if (COMPILED_BUFFER_VAR == NULL) \ + return REG_ESPACE; \ + /* If the buffer moved, move all the pointers into it. */ \ + if (old_buffer != COMPILED_BUFFER_VAR) \ + { \ + int incr = COMPILED_BUFFER_VAR - old_buffer; \ + MOVE_BUFFER_POINTER (b); \ + MOVE_BUFFER_POINTER (begalt); \ + if (fixup_alt_jump) \ + MOVE_BUFFER_POINTER (fixup_alt_jump); \ + if (laststart) \ + MOVE_BUFFER_POINTER (laststart); \ + if (pending_exact) \ + MOVE_BUFFER_POINTER (pending_exact); \ + } \ + ELSE_EXTEND_BUFFER_HIGH_BOUND \ + } while (0) +#endif /* MBS_SUPPORT */ /* Since we have one byte reserved for the register number argument to {start,stop}_memory, the maximum number of groups we can report @@ -1768,20 +2139,20 @@ typedef struct /* Get the next unsigned number in the uncompiled pattern. */ #define GET_UNSIGNED_NUMBER(num) \ - { if (p != pend) \ - { \ - PATFETCH (c); \ - while ('0' <= c && c <= '9') \ - { \ - if (num < 0) \ - num = 0; \ - num = num * 10 + c - '0'; \ - if (p == pend) \ - break; \ - PATFETCH (c); \ - } \ - } \ - } + { \ + while (p != pend) \ + { \ + PATFETCH (c); \ + if (! ('0' <= c && c <= '9')) \ + break; \ + if (num <= RE_DUP_MAX) \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + } \ + } \ + } #if defined _LIBC || WIDE_CHAR_SUPPORT /* The GNU C library provides support for user-defined character classes @@ -1882,33 +2253,61 @@ static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type examined nor set. */ /* Return, freeing storage we allocated. */ -#define FREE_STACK_RETURN(value) \ +#ifdef MBS_SUPPORT +# define FREE_STACK_RETURN(value) \ + return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value) +#else +# define FREE_STACK_RETURN(value) \ return (free (compile_stack.stack), value) +#endif /* MBS_SUPPORT */ static reg_errcode_t +#ifdef MBS_SUPPORT +regex_compile (cpattern, csize, syntax, bufp) + const char *cpattern; + size_t csize; +#else regex_compile (pattern, size, syntax, bufp) const char *pattern; size_t size; +#endif /* MBS_SUPPORT */ reg_syntax_t syntax; struct re_pattern_buffer *bufp; { /* We fetch characters from PATTERN here. Even though PATTERN is `char *' (i.e., signed), we declare these variables as unsigned, so they can be reliably used as array indices. */ - register unsigned char c, c1; + register US_CHAR_TYPE c, c1; + +#ifdef MBS_SUPPORT + /* A temporary space to keep wchar_t pattern and compiled pattern. */ + CHAR_TYPE *pattern, *COMPILED_BUFFER_VAR; + size_t size; + /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ + int *mbs_offset = NULL; + /* It hold whether each wchar_t is binary data or not. */ + char *is_binary = NULL; + /* A flag whether exactn is handling binary data or not. */ + char is_exactn_bin = FALSE; +#endif /* MBS_SUPPORT */ /* A random temporary spot in PATTERN. */ - const char *p1; + const CHAR_TYPE *p1; /* Points to the end of the buffer, where we should append. */ - register unsigned char *b; + register US_CHAR_TYPE *b; /* Keeps track of unclosed groups. */ compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ - const char *p = pattern; - const char *pend = pattern + size; +#ifdef MBS_SUPPORT + const CHAR_TYPE *p; + const CHAR_TYPE *pend; +#else + const CHAR_TYPE *p = pattern; + const CHAR_TYPE *pend = pattern + size; +#endif /* MBS_SUPPORT */ /* How to translate the characters in the pattern. */ RE_TRANSLATE_TYPE translate = bufp->translate; @@ -1917,30 +2316,50 @@ regex_compile (pattern, size, syntax, bufp) command. This makes it possible to tell if a new exact-match character can be added to that command or if the character requires a new `exactn' command. */ - unsigned char *pending_exact = 0; + US_CHAR_TYPE *pending_exact = 0; /* Address of start of the most recently finished expression. This tells, e.g., postfix * where to find the start of its operand. Reset at the beginning of groups and alternatives. */ - unsigned char *laststart = 0; + US_CHAR_TYPE *laststart = 0; /* Address of beginning of regexp, or inside of last group. */ - unsigned char *begalt; - - /* Place in the uncompiled pattern (i.e., the {) to - which to go back if the interval is invalid. */ - const char *beg_interval; + US_CHAR_TYPE *begalt; /* Address of the place where a forward jump should go to the end of the containing expression. Each alternative of an `or' -- except the last -- ends with a forward jump of this sort. */ - unsigned char *fixup_alt_jump = 0; + US_CHAR_TYPE *fixup_alt_jump = 0; /* Counts open-groups as they are encountered. Remembered for the matching close-group on the compile stack, so the same register number is put in the stop_memory as the start_memory. */ regnum_t regnum = 0; +#ifdef MBS_SUPPORT + /* Initialize the wchar_t PATTERN and offset_buffer. */ + p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE); + p[csize] = L'\0'; /* sentinel */ + mbs_offset = TALLOC(csize + 1, int); + is_binary = TALLOC(csize + 1, char); + if (pattern == NULL || mbs_offset == NULL || is_binary == NULL) + { + if (pattern) free(pattern); + if (mbs_offset) free(mbs_offset); + if (is_binary) free(is_binary); + return REG_ESPACE; + } + size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary); + pend = p + size; + if (size < 0) + { + if (pattern) free(pattern); + if (mbs_offset) free(mbs_offset); + if (is_binary) free(is_binary); + return REG_BADPAT; + } +#endif + #ifdef DEBUG DEBUG_PRINT1 ("\nCompiling pattern: "); if (debug) @@ -1948,7 +2367,7 @@ regex_compile (pattern, size, syntax, bufp) unsigned debug_count; for (debug_count = 0; debug_count < size; debug_count++) - putchar (pattern[debug_count]); + PUT_CHAR (pattern[debug_count]); putchar ('\n'); } #endif /* DEBUG */ @@ -1956,7 +2375,14 @@ regex_compile (pattern, size, syntax, bufp) /* Initialize the compile stack. */ compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); if (compile_stack.stack == NULL) - return REG_ESPACE; + { +#ifdef MBS_SUPPORT + if (pattern) free(pattern); + if (mbs_offset) free(mbs_offset); + if (is_binary) free(is_binary); +#endif + return REG_ESPACE; + } compile_stack.size = INIT_COMPILE_STACK_SIZE; compile_stack.avail = 0; @@ -1985,18 +2411,34 @@ regex_compile (pattern, size, syntax, bufp) { /* If zero allocated, but buffer is non-null, try to realloc enough space. This loses if buffer's address is bogus, but that is the user's responsibility. */ - RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); +#ifdef MBS_SUPPORT + /* Free bufp->buffer and allocate an array for wchar_t pattern + buffer. */ + free(bufp->buffer); + COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(US_CHAR_TYPE), + US_CHAR_TYPE); +#else + RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, US_CHAR_TYPE); +#endif /* MBS_SUPPORT */ } else { /* Caller did not allocate a buffer. Do it for them. */ - bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); + COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(US_CHAR_TYPE), + US_CHAR_TYPE); } - if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); + if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE); +#ifdef MBS_SUPPORT + bufp->buffer = (char*)COMPILED_BUFFER_VAR; +#endif /* MBS_SUPPORT */ bufp->allocated = INIT_BUF_SIZE; } +#ifdef MBS_SUPPORT + else + COMPILED_BUFFER_VAR = (US_CHAR_TYPE*) bufp->buffer; +#endif - begalt = b = bufp->buffer; + begalt = b = COMPILED_BUFFER_VAR; /* Loop through the uncompiled pattern until we're at the end. */ while (p != pend) @@ -2121,7 +2563,7 @@ regex_compile (pattern, size, syntax, bufp) assert (p - 1 > pattern); /* Allocate the space for the jump. */ - GET_BUFFER_SPACE (3); + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); /* We know we are not at the first character of the pattern, because laststart was nonzero. And we've already @@ -2138,20 +2580,23 @@ regex_compile (pattern, size, syntax, bufp) } else /* Anything else. */ - STORE_JUMP (maybe_pop_jump, b, laststart - 3); + STORE_JUMP (maybe_pop_jump, b, laststart - + (1 + OFFSET_ADDRESS_SIZE)); /* We've added more stuff to the buffer. */ - b += 3; + b += 1 + OFFSET_ADDRESS_SIZE; } /* On failure, jump from laststart to b + 3, which will be the end of the buffer after this jump is inserted. */ - GET_BUFFER_SPACE (3); + /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of + 'b + 3'. */ + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump : on_failure_jump, - laststart, b + 3); + laststart, b + 1 + OFFSET_ADDRESS_SIZE); pending_exact = 0; - b += 3; + b += 1 + OFFSET_ADDRESS_SIZE; if (!zero_times_ok) { @@ -2160,9 +2605,10 @@ regex_compile (pattern, size, syntax, bufp) `on_failure_jump' instruction of the loop. This effects a skip over that instruction the first time we hit that loop. */ - GET_BUFFER_SPACE (3); - INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); - b += 3; + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); + INSERT_JUMP (dummy_failure_jump, laststart, laststart + + 2 + 2 * OFFSET_ADDRESS_SIZE); + b += 1 + OFFSET_ADDRESS_SIZE; } } break; @@ -2177,9 +2623,476 @@ regex_compile (pattern, size, syntax, bufp) case '[': { boolean had_char_class = false; - +#ifdef MBS_SUPPORT + CHAR_TYPE range_start = 0xffffffff; +#else + unsigned int range_start = 0xffffffff; +#endif if (p == pend) FREE_STACK_RETURN (REG_EBRACK); +#ifdef MBS_SUPPORT + /* We assume a charset(_not) structure as a wchar_t array. + charset[0] = (re_opcode_t) charset(_not) + charset[1] = l (= length of char_classes) + charset[2] = m (= length of collating_symbols) + charset[3] = n (= length of equivalence_classes) + charset[4] = o (= length of char_ranges) + charset[5] = p (= length of chars) + + charset[6] = char_class (wctype_t) + charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t) + ... + charset[l+5] = char_class (wctype_t) + + charset[l+6] = collating_symbol (wchar_t) + ... + charset[l+m+5] = collating_symbol (wchar_t) + ifdef _LIBC we use the index if + _NL_COLLATE_SYMB_EXTRAMB instead of + wchar_t string. + + charset[l+m+6] = equivalence_classes (wchar_t) + ... + charset[l+m+n+5] = equivalence_classes (wchar_t) + ifdef _LIBC we use the index in + _NL_COLLATE_WEIGHT instead of + wchar_t string. + + charset[l+m+n+6] = range_start + charset[l+m+n+7] = range_end + ... + charset[l+m+n+2o+4] = range_start + charset[l+m+n+2o+5] = range_end + ifdef _LIBC we use the value looked up + in _NL_COLLATE_COLLSEQ instead of + wchar_t character. + + charset[l+m+n+2o+6] = char + ... + charset[l+m+n+2o+p+5] = char + + */ + + /* We need at least 6 spaces: the opcode, the length of + char_classes, the length of collating_symbols, the length of + equivalence_classes, the length of char_ranges, the length of + chars. */ + GET_BUFFER_SPACE (6); + + /* Save b as laststart. And We use laststart as the pointer + to the first element of the charset here. + In other words, laststart[i] indicates charset[i]. */ + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH (*p == '^' ? charset_not : charset); + if (*p == '^') + p++; + + /* Push the length of char_classes, the length of + collating_symbols, the length of equivalence_classes, the + length of char_ranges and the length of chars. */ + BUF_PUSH_3 (0, 0, 0); + BUF_PUSH_2 (0, 0); + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-6] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) + { + BUF_PUSH('\n'); + laststart[5]++; /* Update the length of characters */ + } + + /* Read in characters and ranges, setting map bits. */ + for (;;) + { + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + PATFETCH (c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') + { + if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); + + PATFETCH (c1); + BUF_PUSH(c1); + laststart[5]++; /* Update the length of chars */ + range_start = c1; + continue; + } + + /* Could be the end of the bracket expression. If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + FREE_STACK_RETURN (REG_ERANGE); + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' + && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') + { + reg_errcode_t ret; + /* Allocate the space for range_start and range_end. */ + GET_BUFFER_SPACE (2); + /* Update the pointer to indicate end of buffer. */ + b += 2; + ret = compile_range (range_start, &p, pend, translate, + syntax, b, laststart); + if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; + } + else if (p[0] == '-' && p[1] != ']') + { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH (c1); + /* Allocate the space for range_start and range_end. */ + GET_BUFFER_SPACE (2); + /* Update the pointer to indicate end of buffer. */ + b += 2; + ret = compile_range (c, &p, pend, translate, syntax, b, + laststart); + if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; + } + + /* See if we're at the beginning of a possible character + class. */ + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[:'. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == ':' && *p == ']') || p == pend) + break; + if (c1 < CHAR_CLASS_MAX_LENGTH) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and `:]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but store them as character). */ + if (c == ':' && *p == ']') + { + wctype_t wt; + uintptr_t alignedp; + + /* Query the character class as wctype_t. */ + wt = IS_CHAR_CLASS (str); + if (wt == 0) + FREE_STACK_RETURN (REG_ECTYPE); + + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); + + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + /* Allocate the space for character class. */ + GET_BUFFER_SPACE(CHAR_CLASS_SIZE); + /* Update the pointer to indicate end of buffer. */ + b += CHAR_CLASS_SIZE; + /* Move data which follow character classes + not to violate the data. */ + insert_space(CHAR_CLASS_SIZE, + laststart + 6 + laststart[1], + b - 1); + alignedp = ((uintptr_t)(laststart + 6 + laststart[1]) + + __alignof__(wctype_t) - 1) + & ~(uintptr_t)(__alignof__(wctype_t) - 1); + /* Store the character class. */ + *((wctype_t*)alignedp) = wt; + /* Update length of char_classes */ + laststart[1] += CHAR_CLASS_SIZE; + + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + BUF_PUSH ('['); + BUF_PUSH (':'); + laststart[5] += 2; /* Update the length of characters */ + range_start = ':'; + had_char_class = false; + } + } + else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '=' + || *p == '.')) + { + CHAR_TYPE str[128]; /* Should be large enough. */ + CHAR_TYPE delim = *p; /* '=' or '.' */ +# ifdef _LIBC + uint32_t nrules = + _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); +# endif + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[=' or '[[.'. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == delim && *p == ']') || p == pend) + break; + if (c1 < sizeof (str) - 1) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + if (c == delim && *p == ']' && str[0] != '\0') + { + unsigned int i, offset; + /* If we have no collation data we use the default + collation in which each character is in a class + by itself. It also means that ASCII is the + character set and therefore we cannot have character + with more than one byte in the multibyte + representation. */ + + /* If not defined _LIBC, we push the name and + `\0' for the sake of matching performance. */ + int datasize = c1 + 1; + +# ifdef _LIBC + int32_t idx = 0; + if (nrules == 0) +# endif + { + if (c1 != 1) + FREE_STACK_RETURN (REG_ECOLLATE); + } +# ifdef _LIBC + else + { + const int32_t *table; + const int32_t *weights; + const int32_t *extra; + const int32_t *indirect; + wint_t *cp; + + /* This #include defines a local function! */ +# include <locale/weightwc.h> + + if(delim == '=') + { + /* We push the index for equivalence class. */ + cp = (wint_t*)str; + + table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_TABLEWC); + weights = (const int32_t *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_WEIGHTWC); + extra = (const int32_t *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_EXTRAWC); + indirect = (const int32_t *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_INDIRECTWC); + + idx = findidx ((const wint_t**)&cp); + if (idx == 0 || cp < (wint_t*) str + c1) + /* This is no valid character. */ + FREE_STACK_RETURN (REG_ECOLLATE); + + str[0] = (wchar_t)idx; + } + else /* delim == '.' */ + { + /* We push collation sequence value + for collating symbol. */ + int32_t table_size; + const int32_t *symb_table; + const unsigned char *extra; + int32_t idx; + int32_t elem; + int32_t second; + int32_t hash; + char char_str[c1]; + + /* We have to convert the name to a single-byte + string. This is possible since the names + consist of ASCII characters and the internal + representation is UCS4. */ + for (i = 0; i < c1; ++i) + char_str[i] = str[i]; + + table_size = + _NL_CURRENT_WORD (LC_COLLATE, + _NL_COLLATE_SYMB_HASH_SIZEMB); + symb_table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_SYMB_TABLEMB); + extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_SYMB_EXTRAMB); + + /* Locate the character in the hashing table. */ + hash = elem_hash (char_str, c1); + + idx = 0; + elem = hash % table_size; + second = hash % (table_size - 2); + while (symb_table[2 * elem] != 0) + { + /* First compare the hashing value. */ + if (symb_table[2 * elem] == hash + && c1 == extra[symb_table[2 * elem + 1]] + && memcmp (str, + &extra[symb_table[2 * elem + 1] + + 1], c1) == 0) + { + /* Yep, this is the entry. */ + idx = symb_table[2 * elem + 1]; + idx += 1 + extra[idx]; + break; + } + + /* Next entry. */ + elem += second; + } + + if (symb_table[2 * elem] != 0) + { + /* Compute the index of the byte sequence + in the table. */ + idx += 1 + extra[idx]; + /* Adjust for the alignment. */ + idx = (idx + 3) & ~4; + + str[0] = (wchar_t) idx + 4; + } + else if (symb_table[2 * elem] == 0 && c1 == 1) + { + /* No valid character. Match it as a + single byte character. */ + had_char_class = false; + BUF_PUSH(str[0]); + /* Update the length of characters */ + laststart[5]++; + range_start = str[0]; + + /* Throw away the ] at the end of the + collating symbol. */ + PATFETCH (c); + /* exit from the switch block. */ + continue; + } + else + FREE_STACK_RETURN (REG_ECOLLATE); + } + datasize = 1; + } +# endif + /* Throw away the ] at the end of the equivalence + class (or collating symbol). */ + PATFETCH (c); + + /* Allocate the space for the equivalence class + (or collating symbol) (and '\0' if needed). */ + GET_BUFFER_SPACE(datasize); + /* Update the pointer to indicate end of buffer. */ + b += datasize; + + if (delim == '=') + { /* equivalence class */ + /* Calculate the offset of char_ranges, + which is next to equivalence_classes. */ + offset = laststart[1] + laststart[2] + + laststart[3] +6; + /* Insert space. */ + insert_space(datasize, laststart + offset, b - 1); + + /* Write the equivalence_class and \0. */ + for (i = 0 ; i < datasize ; i++) + laststart[offset + i] = str[i]; + + /* Update the length of equivalence_classes. */ + laststart[3] += datasize; + had_char_class = true; + } + else /* delim == '.' */ + { /* collating symbol */ + /* Calculate the offset of the equivalence_classes, + which is next to collating_symbols. */ + offset = laststart[1] + laststart[2] + 6; + /* Insert space and write the collationg_symbol + and \0. */ + insert_space(datasize, laststart + offset, b-1); + for (i = 0 ; i < datasize ; i++) + laststart[offset + i] = str[i]; + + /* In re_match_2_internal if range_start < -1, we + assume -range_start is the offset of the + collating symbol which is specified as + the character of the range start. So we assign + -(laststart[1] + laststart[2] + 6) to + range_start. */ + range_start = -(laststart[1] + laststart[2] + 6); + /* Update the length of collating_symbol. */ + laststart[2] += datasize; + had_char_class = false; + } + } + else + { + c1++; + while (c1--) + PATUNFETCH; + BUF_PUSH ('['); + BUF_PUSH (delim); + laststart[5] += 2; /* Update the length of characters */ + range_start = delim; + had_char_class = false; + } + } + else + { + had_char_class = false; + BUF_PUSH(c); + laststart[5]++; /* Update the length of characters */ + range_start = c; + } + } + +#else /* not MBS_SUPPORT */ /* Ensure that we have enough space to push a charset: the opcode, the length count, and the bitset; 34 bytes in all. */ GET_BUFFER_SPACE (34); @@ -2220,6 +3133,7 @@ regex_compile (pattern, size, syntax, bufp) PATFETCH (c1); SET_LIST_BIT (c1); + range_start = c1; continue; } @@ -2244,8 +3158,10 @@ regex_compile (pattern, size, syntax, bufp) && *p != ']') { reg_errcode_t ret - = compile_range (&p, pend, translate, syntax, b); + = compile_range (range_start, &p, pend, translate, + syntax, b); if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; } else if (p[0] == '-' && p[1] != ']') @@ -2255,8 +3171,9 @@ regex_compile (pattern, size, syntax, bufp) /* Move past the `-'. */ PATFETCH (c1); - ret = compile_range (&p, pend, translate, syntax, b); + ret = compile_range (c, &p, pend, translate, syntax, b); if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; } /* See if we're at the beginning of a possible character @@ -2290,7 +3207,7 @@ regex_compile (pattern, size, syntax, bufp) the leading `:' and `[' (but set bits for them). */ if (c == ':' && *p == ']') { -#if defined _LIBC || WIDE_CHAR_SUPPORT +# if defined _LIBC || WIDE_CHAR_SUPPORT boolean is_lower = STREQ (str, "lower"); boolean is_upper = STREQ (str, "upper"); wctype_t wt; @@ -2308,13 +3225,13 @@ regex_compile (pattern, size, syntax, bufp) for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) { -# ifdef _LIBC +# ifdef _LIBC if (__iswctype (__btowc (ch), wt)) SET_LIST_BIT (ch); -# else +# else if (iswctype (btowc (ch), wt)) SET_LIST_BIT (ch); -# endif +# endif if (translate && (is_upper || is_lower) && (ISUPPER (ch) || ISLOWER (ch))) @@ -2322,7 +3239,7 @@ regex_compile (pattern, size, syntax, bufp) } had_char_class = true; -#else +# else int ch; boolean is_alnum = STREQ (str, "alnum"); boolean is_alpha = STREQ (str, "alpha"); @@ -2370,7 +3287,7 @@ regex_compile (pattern, size, syntax, bufp) SET_LIST_BIT (ch); } had_char_class = true; -#endif /* libc || wctype.h */ +# endif /* libc || wctype.h */ } else { @@ -2379,13 +3296,292 @@ regex_compile (pattern, size, syntax, bufp) PATUNFETCH; SET_LIST_BIT ('['); SET_LIST_BIT (':'); + range_start = ':'; had_char_class = false; } } + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=') + { + unsigned char str[MB_LEN_MAX + 1]; +# ifdef _LIBC + uint32_t nrules = + _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); +# endif + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[='. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == '=' && *p == ']') || p == pend) + break; + if (c1 < MB_LEN_MAX) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + if (c == '=' && *p == ']' && str[0] != '\0') + { + /* If we have no collation data we use the default + collation in which each character is in a class + by itself. It also means that ASCII is the + character set and therefore we cannot have character + with more than one byte in the multibyte + representation. */ +# ifdef _LIBC + if (nrules == 0) +# endif + { + if (c1 != 1) + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the equivalence + class. */ + PATFETCH (c); + + /* Set the bit for the character. */ + SET_LIST_BIT (str[0]); + } +# ifdef _LIBC + else + { + /* Try to match the byte sequence in `str' against + those known to the collate implementation. + First find out whether the bytes in `str' are + actually from exactly one character. */ + const int32_t *table; + const unsigned char *weights; + const unsigned char *extra; + const int32_t *indirect; + int32_t idx; + const unsigned char *cp = str; + int ch; + + /* This #include defines a local function! */ +# include <locale/weight.h> + + table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); + weights = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); + extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); + indirect = (const int32_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); + + idx = findidx (&cp); + if (idx == 0 || cp < str + c1) + /* This is no valid character. */ + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the equivalence + class. */ + PATFETCH (c); + + /* Now we have to go throught the whole table + and find all characters which have the same + first level weight. + + XXX Note that this is not entirely correct. + we would have to match multibyte sequences + but this is not possible with the current + implementation. */ + for (ch = 1; ch < 256; ++ch) + /* XXX This test would have to be changed if we + would allow matching multibyte sequences. */ + if (table[ch] > 0) + { + int32_t idx2 = table[ch]; + size_t len = weights[idx2]; + + /* Test whether the lenghts match. */ + if (weights[idx] == len) + { + /* They do. New compare the bytes of + the weight. */ + size_t cnt = 0; + + while (cnt < len + && (weights[idx + 1 + cnt] + == weights[idx2 + 1 + cnt])) + ++cnt; + + if (cnt == len) + /* They match. Mark the character as + acceptable. */ + SET_LIST_BIT (ch); + } + } + } +# endif + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT ('='); + range_start = '='; + had_char_class = false; + } + } + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.') + { + unsigned char str[128]; /* Should be large enough. */ +# ifdef _LIBC + uint32_t nrules = + _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); +# endif + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[.'. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == '.' && *p == ']') || p == pend) + break; + if (c1 < sizeof (str)) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + if (c == '.' && *p == ']' && str[0] != '\0') + { + /* If we have no collation data we use the default + collation in which each character is the name + for its own class which contains only the one + character. It also means that ASCII is the + character set and therefore we cannot have character + with more than one byte in the multibyte + representation. */ +# ifdef _LIBC + if (nrules == 0) +# endif + { + if (c1 != 1) + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the equivalence + class. */ + PATFETCH (c); + + /* Set the bit for the character. */ + SET_LIST_BIT (str[0]); + range_start = ((const unsigned char *) str)[0]; + } +# ifdef _LIBC + else + { + /* Try to match the byte sequence in `str' against + those known to the collate implementation. + First find out whether the bytes in `str' are + actually from exactly one character. */ + int32_t table_size; + const int32_t *symb_table; + const unsigned char *extra; + int32_t idx; + int32_t elem; + int32_t second; + int32_t hash; + + table_size = + _NL_CURRENT_WORD (LC_COLLATE, + _NL_COLLATE_SYMB_HASH_SIZEMB); + symb_table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_SYMB_TABLEMB); + extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_SYMB_EXTRAMB); + + /* Locate the character in the hashing table. */ + hash = elem_hash (str, c1); + + idx = 0; + elem = hash % table_size; + second = hash % (table_size - 2); + while (symb_table[2 * elem] != 0) + { + /* First compare the hashing value. */ + if (symb_table[2 * elem] == hash + && c1 == extra[symb_table[2 * elem + 1]] + && memcmp (str, + &extra[symb_table[2 * elem + 1] + + 1], + c1) == 0) + { + /* Yep, this is the entry. */ + idx = symb_table[2 * elem + 1]; + idx += 1 + extra[idx]; + break; + } + + /* Next entry. */ + elem += second; + } + + if (symb_table[2 * elem] == 0) + /* This is no valid character. */ + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the equivalence + class. */ + PATFETCH (c); + + /* Now add the multibyte character(s) we found + to the accept list. + + XXX Note that this is not entirely correct. + we would have to match multibyte sequences + but this is not possible with the current + implementation. Also, we have to match + collating symbols, which expand to more than + one file, as a whole and not allow the + individual bytes. */ + c1 = extra[idx++]; + if (c1 == 1) + range_start = extra[idx]; + while (c1-- > 0) + { + SET_LIST_BIT (extra[idx]); + ++idx; + } + } +# endif + had_char_class = false; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT ('.'); + range_start = '.'; + had_char_class = false; + } + } else { had_char_class = false; SET_LIST_BIT (c); + range_start = c; } } @@ -2394,6 +3590,7 @@ regex_compile (pattern, size, syntax, bufp) while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) b[-1]--; b += b[-1]; +#endif /* MBS_SUPPORT */ } break; @@ -2464,10 +3661,10 @@ regex_compile (pattern, size, syntax, bufp) group. They are all relative offsets, so that if the whole pattern moves because of realloc, they will still be valid. */ - COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; + COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR; COMPILE_STACK_TOP.fixup_alt_jump - = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; - COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; + = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0; + COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR; COMPILE_STACK_TOP.regnum = regnum; /* We will eventually replace the 0 with the number of @@ -2476,7 +3673,8 @@ regex_compile (pattern, size, syntax, bufp) represent in the compiled pattern. */ if (regnum <= MAX_REGNUM) { - COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; + COMPILE_STACK_TOP.inner_group_offset = b + - COMPILED_BUFFER_VAR + 2; BUF_PUSH_3 (start_memory, regnum, 0); } @@ -2535,12 +3733,12 @@ regex_compile (pattern, size, syntax, bufp) regnum_t this_group_regnum; compile_stack.avail--; - begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; + begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset; fixup_alt_jump = COMPILE_STACK_TOP.fixup_alt_jump - ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 + ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1 : 0; - laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; + laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset; this_group_regnum = COMPILE_STACK_TOP.regnum; /* If we've reached MAX_REGNUM groups, then this open won't actually generate any code, so we'll have to @@ -2551,8 +3749,8 @@ regex_compile (pattern, size, syntax, bufp) groups were inside this one. */ if (this_group_regnum <= MAX_REGNUM) { - unsigned char *inner_group_loc - = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; + US_CHAR_TYPE *inner_group_loc + = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset; *inner_group_loc = regnum - this_group_regnum; BUF_PUSH_3 (stop_memory, this_group_regnum, @@ -2571,10 +3769,11 @@ regex_compile (pattern, size, syntax, bufp) /* Insert before the previous alternative a jump which jumps to this alternative if the former fails. */ - GET_BUFFER_SPACE (3); - INSERT_JUMP (on_failure_jump, begalt, b + 6); + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); + INSERT_JUMP (on_failure_jump, begalt, + b + 2 + 2 * OFFSET_ADDRESS_SIZE); pending_exact = 0; - b += 3; + b += 1 + OFFSET_ADDRESS_SIZE; /* The alternative before this one has a jump after it which gets executed if it gets matched. Adjust that @@ -2599,8 +3798,8 @@ regex_compile (pattern, size, syntax, bufp) to be filled in later either by next alternative or when know we're at the end of a series of alternatives. */ fixup_alt_jump = b; - GET_BUFFER_SPACE (3); - b += 3; + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); + b += 1 + OFFSET_ADDRESS_SIZE; laststart = 0; begalt = b; @@ -2612,8 +3811,7 @@ regex_compile (pattern, size, syntax, bufp) if (!(syntax & RE_INTERVALS) /* If we're at `\{' and it's not the open-interval operator. */ - || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) - || (p - 2 == pattern && p == pend)) + || (syntax & RE_NO_BK_BRACES)) goto normal_backslash; handle_interval: @@ -2623,57 +3821,43 @@ regex_compile (pattern, size, syntax, bufp) /* At least (most) this many matches must be made. */ int lower_bound = -1, upper_bound = -1; - beg_interval = p - 1; + /* Place in the uncompiled pattern (i.e., just after + the '{') to go back to if the interval is invalid. */ + const CHAR_TYPE *beg_interval = p; if (p == pend) - { - if (syntax & RE_NO_BK_BRACES) - goto unfetch_interval; - else - FREE_STACK_RETURN (REG_EBRACE); - } + goto invalid_interval; GET_UNSIGNED_NUMBER (lower_bound); if (c == ',') { GET_UNSIGNED_NUMBER (upper_bound); - if (upper_bound < 0) upper_bound = RE_DUP_MAX; + if (upper_bound < 0) + upper_bound = RE_DUP_MAX; } else /* Interval such as `{1}' => match exactly once. */ upper_bound = lower_bound; - if (lower_bound < 0 || upper_bound > RE_DUP_MAX - || lower_bound > upper_bound) - { - if (syntax & RE_NO_BK_BRACES) - goto unfetch_interval; - else - FREE_STACK_RETURN (REG_BADBR); - } + if (! (0 <= lower_bound && lower_bound <= upper_bound)) + goto invalid_interval; if (!(syntax & RE_NO_BK_BRACES)) { - if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); - + if (c != '\\' || p == pend) + goto invalid_interval; PATFETCH (c); } if (c != '}') - { - if (syntax & RE_NO_BK_BRACES) - goto unfetch_interval; - else - FREE_STACK_RETURN (REG_BADBR); - } - - /* We just parsed a valid interval. */ + goto invalid_interval; /* If it's invalid to have no preceding re. */ if (!laststart) { - if (syntax & RE_CONTEXT_INVALID_OPS) + if (syntax & RE_CONTEXT_INVALID_OPS + && !(syntax & RE_INVALID_INTERVAL_ORD)) FREE_STACK_RETURN (REG_BADRPT); else if (syntax & RE_CONTEXT_INDEP_OPS) laststart = b; @@ -2681,14 +3865,22 @@ regex_compile (pattern, size, syntax, bufp) goto unfetch_interval; } + /* We just parsed a valid interval. */ + + if (RE_DUP_MAX < upper_bound) + FREE_STACK_RETURN (REG_BADBR); + /* If the upper bound is zero, don't want to succeed at all; jump from `laststart' to `b + 3', which will be - the end of the buffer after we insert the jump. */ + the end of the buffer after we insert the jump. */ + /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' + instead of 'b + 3'. */ if (upper_bound == 0) { - GET_BUFFER_SPACE (3); - INSERT_JUMP (jump, laststart, b + 3); - b += 3; + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); + INSERT_JUMP (jump, laststart, b + 1 + + OFFSET_ADDRESS_SIZE); + b += 1 + OFFSET_ADDRESS_SIZE; } /* Otherwise, we have a nontrivial interval. When @@ -2703,7 +3895,8 @@ regex_compile (pattern, size, syntax, bufp) else { /* If the upper bound is > 1, we need to insert more at the end of the loop. */ - unsigned nbytes = 10 + (upper_bound > 1) * 10; + unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE + + (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE); GET_BUFFER_SPACE (nbytes); @@ -2713,16 +3906,21 @@ regex_compile (pattern, size, syntax, bufp) because `re_compile_fastmap' needs to know. Jump to the `jump_n' we might insert below. */ INSERT_JUMP2 (succeed_n, laststart, - b + 5 + (upper_bound > 1) * 5, - lower_bound); - b += 5; + b + 1 + 2 * OFFSET_ADDRESS_SIZE + + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE) + , lower_bound); + b += 1 + 2 * OFFSET_ADDRESS_SIZE; /* Code to initialize the lower bound. Insert before the `succeed_n'. The `5' is the last two bytes of this `set_number_at', plus 3 bytes of the following `succeed_n'. */ - insert_op2 (set_number_at, laststart, 5, lower_bound, b); - b += 5; + /* ifdef MBS_SUPPORT, The '1+2*OFFSET_ADDRESS_SIZE' + is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE' + of the following `succeed_n'. */ + insert_op2 (set_number_at, laststart, 1 + + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b); + b += 1 + 2 * OFFSET_ADDRESS_SIZE; if (upper_bound > 1) { /* More than one repetition is allowed, so @@ -2732,9 +3930,10 @@ regex_compile (pattern, size, syntax, bufp) When we've reached this during matching, we'll have matched the interval once, so jump back only `upper_bound - 1' times. */ - STORE_JUMP2 (jump_n, b, laststart + 5, + STORE_JUMP2 (jump_n, b, laststart + + 2 * OFFSET_ADDRESS_SIZE + 1, upper_bound - 1); - b += 5; + b += 1 + 2 * OFFSET_ADDRESS_SIZE; /* The location we want to set is the second parameter of the `jump_n'; that is `b-2' as @@ -2752,29 +3951,24 @@ regex_compile (pattern, size, syntax, bufp) reinitialize the bounds. */ insert_op2 (set_number_at, laststart, b - laststart, upper_bound - 1, b); - b += 5; + b += 1 + 2 * OFFSET_ADDRESS_SIZE; } } pending_exact = 0; - beg_interval = NULL; - } - break; - - unfetch_interval: - /* If an invalid interval, match the characters as literals. */ - assert (beg_interval); - p = beg_interval; - beg_interval = NULL; - - /* normal_char and normal_backslash need `c'. */ - PATFETCH (c); - - if (!(syntax & RE_NO_BK_BRACES)) - { - if (p > pattern && p[-1] == '\\') - goto normal_backslash; - } - goto normal_char; + break; + + invalid_interval: + if (!(syntax & RE_INVALID_INTERVAL_ORD)) + FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR); + unfetch_interval: + /* Match the characters as literals. */ + p = beg_interval; + c = '{'; + if (syntax & RE_NO_BK_BRACES) + goto normal_char; + else + goto normal_backslash; + } #ifdef emacs /* There is no way to specify the before_dot and after_dot @@ -2891,6 +4085,11 @@ regex_compile (pattern, size, syntax, bufp) normal_char: /* If no exactn currently being built. */ if (!pending_exact +#ifdef MBS_SUPPORT + /* If last exactn handle binary(or character) and + new exactn handle character(or binary). */ + || is_exactn_bin != is_binary[p - 1 - pattern] +#endif /* MBS_SUPPORT */ /* If last exactn not at current position. */ || pending_exact + *pending_exact + 1 != b @@ -2912,7 +4111,16 @@ regex_compile (pattern, size, syntax, bufp) laststart = b; +#ifdef MBS_SUPPORT + /* Is this exactn binary data or character? */ + is_exactn_bin = is_binary[p - 1 - pattern]; + if (is_exactn_bin) + BUF_PUSH_2 (exactn_bin, 0); + else + BUF_PUSH_2 (exactn, 0); +#else BUF_PUSH_2 (exactn, 0); +#endif /* MBS_SUPPORT */ pending_exact = b - 1; } @@ -2936,10 +4144,19 @@ regex_compile (pattern, size, syntax, bufp) if (syntax & RE_NO_POSIX_BACKTRACKING) BUF_PUSH (succeed); +#ifdef MBS_SUPPORT + free (pattern); + free (mbs_offset); + free (is_binary); +#endif free (compile_stack.stack); /* We have succeeded; set the length of the buffer. */ +#ifdef MBS_SUPPORT + bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR; +#else bufp->used = b - bufp->buffer; +#endif #ifdef DEBUG if (debug) @@ -2996,44 +4213,47 @@ regex_compile (pattern, size, syntax, bufp) /* Subroutines for `regex_compile'. */ /* Store OP at LOC followed by two-byte integer parameter ARG. */ +/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */ static void store_op1 (op, loc, arg) re_opcode_t op; - unsigned char *loc; + US_CHAR_TYPE *loc; int arg; { - *loc = (unsigned char) op; + *loc = (US_CHAR_TYPE) op; STORE_NUMBER (loc + 1, arg); } /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ +/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */ static void store_op2 (op, loc, arg1, arg2) re_opcode_t op; - unsigned char *loc; + US_CHAR_TYPE *loc; int arg1, arg2; { - *loc = (unsigned char) op; + *loc = (US_CHAR_TYPE) op; STORE_NUMBER (loc + 1, arg1); - STORE_NUMBER (loc + 3, arg2); + STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2); } /* Copy the bytes from LOC to END to open up three bytes of space at LOC for OP followed by two-byte integer parameter ARG. */ +/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */ static void insert_op1 (op, loc, arg, end) re_opcode_t op; - unsigned char *loc; + US_CHAR_TYPE *loc; int arg; - unsigned char *end; + US_CHAR_TYPE *end; { - register unsigned char *pfrom = end; - register unsigned char *pto = end + 3; + register US_CHAR_TYPE *pfrom = end; + register US_CHAR_TYPE *pto = end + 1 + OFFSET_ADDRESS_SIZE; while (pfrom != loc) *--pto = *--pfrom; @@ -3043,16 +4263,17 @@ insert_op1 (op, loc, arg, end) /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ +/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */ static void insert_op2 (op, loc, arg1, arg2, end) re_opcode_t op; - unsigned char *loc; + US_CHAR_TYPE *loc; int arg1, arg2; - unsigned char *end; + US_CHAR_TYPE *end; { - register unsigned char *pfrom = end; - register unsigned char *pto = end + 5; + register US_CHAR_TYPE *pfrom = end; + register US_CHAR_TYPE *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE; while (pfrom != loc) *--pto = *--pfrom; @@ -3067,10 +4288,10 @@ insert_op2 (op, loc, arg1, arg2, end) static boolean at_begline_loc_p (pattern, p, syntax) - const char *pattern, *p; + const CHAR_TYPE *pattern, *p; reg_syntax_t syntax; { - const char *prev = p - 2; + const CHAR_TYPE *prev = p - 2; boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; return @@ -3086,12 +4307,12 @@ at_begline_loc_p (pattern, p, syntax) static boolean at_endline_loc_p (p, pend, syntax) - const char *p, *pend; + const CHAR_TYPE *p, *pend; reg_syntax_t syntax; { - const char *next = p; + const CHAR_TYPE *next = p; boolean next_backslash = *next == '\\'; - const char *next_next = p + 1 < pend ? p + 1 : 0; + const CHAR_TYPE *next_next = p + 1 < pend ? p + 1 : 0; return /* Before a subexpression? */ @@ -3122,7 +4343,100 @@ group_in_compile_stack (compile_stack, regnum) return false; } +#ifdef MBS_SUPPORT +/* This insert space, which size is "num", into the pattern at "loc". + "end" must point the end of the allocated buffer. */ +static void +insert_space (num, loc, end) + int num; + CHAR_TYPE *loc; + CHAR_TYPE *end; +{ + register CHAR_TYPE *pto = end; + register CHAR_TYPE *pfrom = end - num; + + while (pfrom >= loc) + *pto-- = *pfrom--; +} +#endif /* MBS_SUPPORT */ + +#ifdef MBS_SUPPORT +static reg_errcode_t +compile_range (range_start_char, p_ptr, pend, translate, syntax, b, + char_set) + CHAR_TYPE range_start_char; + const CHAR_TYPE **p_ptr, *pend; + CHAR_TYPE *char_set, *b; + RE_TRANSLATE_TYPE translate; + reg_syntax_t syntax; +{ + const CHAR_TYPE *p = *p_ptr; + CHAR_TYPE range_start, range_end; + reg_errcode_t ret; +# ifdef _LIBC + uint32_t nrules; + uint32_t start_val, end_val; +# endif + if (p == pend) + return REG_ERANGE; + +# ifdef _LIBC + nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); + if (nrules != 0) + { + const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE, + _NL_COLLATE_COLLSEQWC); + const unsigned char *extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); + + if (range_start_char < -1) + { + /* range_start is a collating symbol. */ + int32_t *wextra; + /* Retreive the index and get collation sequence value. */ + wextra = (int32_t*)(extra + char_set[-range_start_char]); + start_val = wextra[1 + *wextra]; + } + else + start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char)); + + end_val = collseq_table_lookup (collseq, TRANSLATE (p[0])); + + /* Report an error if the range is empty and the syntax prohibits + this. */ + ret = ((syntax & RE_NO_EMPTY_RANGES) + && (start_val > end_val))? REG_ERANGE : REG_NOERROR; + /* Insert space to the end of the char_ranges. */ + insert_space(2, b - char_set[5] - 2, b - 1); + *(b - char_set[5] - 2) = (wchar_t)start_val; + *(b - char_set[5] - 1) = (wchar_t)end_val; + char_set[4]++; /* ranges_index */ + } + else +# endif + { + range_start = (range_start_char >= 0)? TRANSLATE (range_start_char): + range_start_char; + range_end = TRANSLATE (p[0]); + /* Report an error if the range is empty and the syntax prohibits + this. */ + ret = ((syntax & RE_NO_EMPTY_RANGES) + && (range_start > range_end))? REG_ERANGE : REG_NOERROR; + + /* Insert space to the end of the char_ranges. */ + insert_space(2, b - char_set[5] - 2, b - 1); + *(b - char_set[5] - 2) = range_start; + *(b - char_set[5] - 1) = range_end; + char_set[4]++; /* ranges_index */ + } + /* Have to increment the pointer into the pattern string, so the + caller isn't still at the ending character. */ + (*p_ptr)++; + + return ret; +} +#else /* Read the ending character of a range (in a bracket expression) from the uncompiled pattern *P_PTR (which ends at PEND). We assume the starting character is in `P[-2]'. (`P[-1]' is the character `-'.) @@ -3135,27 +4449,27 @@ group_in_compile_stack (compile_stack, regnum) `regex_compile' itself. */ static reg_errcode_t -compile_range (p_ptr, pend, translate, syntax, b) - const char **p_ptr, *pend; - RE_TRANSLATE_TYPE translate; - reg_syntax_t syntax; - unsigned char *b; +compile_range (range_start_char, p_ptr, pend, translate, syntax, b) + unsigned int range_start_char; + const char **p_ptr, *pend; + RE_TRANSLATE_TYPE translate; + reg_syntax_t syntax; + unsigned char *b; { unsigned this_char; - const char *p = *p_ptr; reg_errcode_t ret; - char range_start[2]; - char range_end[2]; +# if _LIBC + const unsigned char *collseq; + unsigned int start_colseq; + unsigned int end_colseq; +# else + unsigned end_char; +# endif if (p == pend) return REG_ERANGE; - /* Fetch the endpoints without translating them; the - appropriate translation is done in the bit-setting loop below. */ - range_start[0] = p[-2]; range_start[1] = '\0'; - range_end[0] = p[ 0]; range_end[1] = '\0'; - /* Have to increment the pointer into the pattern string, so the caller isn't still at the ending character. */ (*p_ptr)++; @@ -3163,22 +4477,44 @@ compile_range (p_ptr, pend, translate, syntax, b) /* Report an error if the range is empty and the syntax prohibits this. */ ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; - /* Here we see why `this_char' has to be larger than an `unsigned - char' -- we would otherwise go into an infinite - loop, since all characters <= 0xff. */ - for (this_char = 0; this_char <= (unsigned char) -1; this_char++) +# if _LIBC + collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_COLLSEQMB); + + start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)]; + end_colseq = collseq[(unsigned char) TRANSLATE (p[0])]; + for (this_char = 0; this_char <= (unsigned char) -1; ++this_char) { - char ch[2]; - ch[0] = this_char; ch[1] = '\0'; - if (strcoll (range_start, ch) <= 0 && strcoll (ch, range_end) <= 0) + unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)]; + + if (start_colseq <= this_colseq && this_colseq <= end_colseq) { SET_LIST_BIT (TRANSLATE (this_char)); ret = REG_NOERROR; } } +# else + /* Here we see why `this_char' has to be larger than an `unsigned + char' -- we would otherwise go into an infinite loop, since all + characters <= 0xff. */ + range_start_char = TRANSLATE (range_start_char); + /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE, + and some compilers cast it to int implicitly, so following for_loop + may fall to (almost) infinite loop. + e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff. + To avoid this, we cast p[0] to unsigned int and truncate it. */ + end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1)); + + for (this_char = range_start_char; this_char <= end_char; ++this_char) + { + SET_LIST_BIT (TRANSLATE (this_char)); + ret = REG_NOERROR; + } +# endif return ret; } +#endif /* MBS_SUPPORT */ /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible @@ -3193,6 +4529,21 @@ compile_range (p_ptr, pend, translate, syntax, b) Returns 0 if we succeed, -2 if an internal error. */ +#ifdef MBS_SUPPORT +/* local function for re_compile_fastmap. + truncate wchar_t character to char. */ +static unsigned char truncate_wchar (CHAR_TYPE c); + +static unsigned char +truncate_wchar (c) + CHAR_TYPE c; +{ + unsigned char buf[MB_LEN_MAX]; + int retval = wctomb(buf, c); + return retval > 0 ? buf[0] : (unsigned char)c; +} +#endif /* MBS_SUPPORT */ + int re_compile_fastmap (bufp) struct re_pattern_buffer *bufp; @@ -3206,9 +4557,17 @@ re_compile_fastmap (bufp) #endif register char *fastmap = bufp->fastmap; - unsigned char *pattern = bufp->buffer; - unsigned char *p = pattern; - register unsigned char *pend = pattern + bufp->used; + +#ifdef MBS_SUPPORT + /* We need to cast pattern to (wchar_t*), because we casted this compiled + pattern to (char*) in regex_compile. */ + US_CHAR_TYPE *pattern = (US_CHAR_TYPE*)bufp->buffer; + register US_CHAR_TYPE *pend = (US_CHAR_TYPE*) (bufp->buffer + bufp->used); +#else + US_CHAR_TYPE *pattern = bufp->buffer; + register US_CHAR_TYPE *pend = pattern + bufp->used; +#endif /* MBS_SUPPORT */ + US_CHAR_TYPE *p = pattern; #ifdef REL_ALLOC /* This holds the pointer to the failure stack, when @@ -3271,11 +4630,30 @@ re_compile_fastmap (bufp) /* Following are the cases which match a character. These end with `break'. */ +#ifdef MBS_SUPPORT + case exactn: + fastmap[truncate_wchar(p[1])] = 1; + break; + case exactn_bin: + fastmap[p[1]] = 1; + break; +#else case exactn: fastmap[p[1]] = 1; break; +#endif /* MBS_SUPPORT */ +#ifdef MBS_SUPPORT + /* It is hard to distinguish fastmap from (multi byte) characters + which depends on current locale. */ + case charset: + case charset_not: + case wordchar: + case notwordchar: + bufp->can_be_null = 1; + goto done; +#else case charset: for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) @@ -3306,7 +4684,7 @@ re_compile_fastmap (bufp) if (SYNTAX (j) != Sword) fastmap[j] = 1; break; - +#endif case anychar: { @@ -3436,13 +4814,13 @@ re_compile_fastmap (bufp) case succeed_n: /* Get to the number of times to succeed. */ - p += 2; + p += OFFSET_ADDRESS_SIZE; /* Increment p past the n for when k != 0. */ EXTRACT_NUMBER_AND_INCR (k, p); if (k == 0) { - p -= 4; + p -= 2 * OFFSET_ADDRESS_SIZE; succeed_n_p = true; /* Spaghetti code alert. */ goto handle_on_failure_jump; } @@ -3450,7 +4828,7 @@ re_compile_fastmap (bufp) case set_number_at: - p += 4; + p += 2 * OFFSET_ADDRESS_SIZE; continue; @@ -3527,7 +4905,7 @@ weak_alias (__re_set_registers, re_set_registers) /* Searching routines. */ /* Like re_search_2, below, but only one string is specified, and - doesn't let you say where to stop matching. */ + doesn't let you say where to stop matching. */ int re_search (bufp, string, size, startpos, range, regs) @@ -3658,9 +5036,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) } else /* Searching backwards. */ { - register char c = (size1 == 0 || startpos >= size1 - ? string2[startpos - size1] - : string1[startpos]); + register CHAR_TYPE c = (size1 == 0 || startpos >= size1 + ? string2[startpos - size1] + : string1[startpos]); if (!fastmap[(unsigned char) TRANSLATE (c)]) goto advance; @@ -3706,12 +5084,24 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) weak_alias (__re_search_2, re_search_2) #endif +#ifdef MBS_SUPPORT +/* This converts PTR, a pointer into one of the search wchar_t strings + `string1' and `string2' into an multibyte string offset from the + beginning of that string. We use mbs_offset to optimize. + See convert_mbs_to_wcs. */ +# define POINTER_TO_OFFSET(ptr) \ + (FIRST_STRING_P (ptr) \ + ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \ + : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \ + + csize1))) +#else /* This converts PTR, a pointer into one of the search strings `string1' and `string2' into an offset from the beginning of that string. */ -#define POINTER_TO_OFFSET(ptr) \ +# define POINTER_TO_OFFSET(ptr) \ (FIRST_STRING_P (ptr) \ ? ((regoff_t) ((ptr) - string1)) \ : ((regoff_t) ((ptr) - string2 + size1))) +#endif /* MBS_SUPPORT */ /* Macros for dealing with the split strings in re_match_2. */ @@ -3741,10 +5131,17 @@ weak_alias (__re_search_2, re_search_2) two special cases to check for: if past the end of string1, look at the first character in string2; and if before the beginning of string2, look at the last character in string1. */ -#define WORDCHAR_P(d) \ +#ifdef MBS_SUPPORT +/* Use internationalized API instead of SYNTAX. */ +# define WORDCHAR_P(d) \ + (iswalnum ((wint_t)((d) == end1 ? *string2 \ + : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0) +#else +# define WORDCHAR_P(d) \ (SYNTAX ((d) == end1 ? *string2 \ : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ == Sword) +#endif /* MBS_SUPPORT */ /* Disabled due to a compiler bug -- see comment at case wordbound */ #if 0 @@ -3758,7 +5155,26 @@ weak_alias (__re_search_2, re_search_2) /* Free everything we malloc. */ #ifdef MATCH_MAY_ALLOCATE # define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL -# define FREE_VARIABLES() \ +# ifdef MBS_SUPPORT +# define FREE_VARIABLES() \ + do { \ + REGEX_FREE_STACK (fail_stack.stack); \ + FREE_VAR (regstart); \ + FREE_VAR (regend); \ + FREE_VAR (old_regstart); \ + FREE_VAR (old_regend); \ + FREE_VAR (best_regstart); \ + FREE_VAR (best_regend); \ + FREE_VAR (reg_info); \ + FREE_VAR (reg_dummy); \ + FREE_VAR (reg_info_dummy); \ + FREE_VAR (string1); \ + FREE_VAR (string2); \ + FREE_VAR (mbs_offset1); \ + FREE_VAR (mbs_offset2); \ + } while (0) +# else /* not MBS_SUPPORT */ +# define FREE_VARIABLES() \ do { \ REGEX_FREE_STACK (fail_stack.stack); \ FREE_VAR (regstart); \ @@ -3771,8 +5187,20 @@ weak_alias (__re_search_2, re_search_2) FREE_VAR (reg_dummy); \ FREE_VAR (reg_info_dummy); \ } while (0) +# endif /* MBS_SUPPORT */ #else -# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ +# define FREE_VAR(var) if (var) free (var); var = NULL +# ifdef MBS_SUPPORT +# define FREE_VARIABLES() \ + do { \ + FREE_VAR (string1); \ + FREE_VAR (string2); \ + FREE_VAR (mbs_offset1); \ + FREE_VAR (mbs_offset2); \ + } while (0) +# else +# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ +# endif /* MBS_SUPPORT */ #endif /* not MATCH_MAY_ALLOCATE */ /* These values must meet several constraints. They must not be valid @@ -3811,16 +5239,16 @@ weak_alias (__re_match, re_match) # endif #endif /* not emacs */ -static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p, - unsigned char *end, +static boolean group_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p, + US_CHAR_TYPE *end, register_info_type *reg_info)); -static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p, - unsigned char *end, +static boolean alt_match_null_string_p _RE_ARGS ((US_CHAR_TYPE *p, + US_CHAR_TYPE *end, register_info_type *reg_info)); -static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p, - unsigned char *end, +static boolean common_op_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p, + US_CHAR_TYPE *end, register_info_type *reg_info)); -static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2, +static int bcmp_translate _RE_ARGS ((const CHAR_TYPE *s1, const CHAR_TYPE *s2, int len, char *translate)); /* re_match_2 matches the compiled pattern in BUFP against the @@ -3858,38 +5286,97 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) weak_alias (__re_match_2, re_match_2) #endif +#ifdef MBS_SUPPORT + +static int count_mbs_length PARAMS ((int *, int)); + +/* This check the substring (from 0, to length) of the multibyte string, + to which offset_buffer correspond. And count how many wchar_t_characters + the substring occupy. We use offset_buffer to optimization. + See convert_mbs_to_wcs. */ + +static int +count_mbs_length(offset_buffer, length) + int *offset_buffer; + int length; +{ + int wcs_size; + + /* Check whether the size is valid. */ + if (length < 0) + return -1; + + if (offset_buffer == NULL) + return 0; + + for (wcs_size = 0 ; offset_buffer[wcs_size] != -1 ; wcs_size++) + { + if (offset_buffer[wcs_size] == length) + return wcs_size; + if (offset_buffer[wcs_size] > length) + /* It is a fragment of a wide character. */ + return -1; + } + + /* We reached at the sentinel. */ + return -1; +} +#endif /* MBS_SUPPORT */ + /* This is a separate function so that we can force an alloca cleanup afterwards. */ static int +#ifdef MBS_SUPPORT +re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos, regs, stop) + struct re_pattern_buffer *bufp; + const char *cstring1, *cstring2; + int csize1, csize2; +#else re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) struct re_pattern_buffer *bufp; const char *string1, *string2; int size1, size2; +#endif int pos; struct re_registers *regs; int stop; { /* General temporaries. */ int mcnt; - unsigned char *p1; + US_CHAR_TYPE *p1; +#ifdef MBS_SUPPORT + /* We need wchar_t* buffers correspond to string1, string2. */ + CHAR_TYPE *string1 = NULL, *string2 = NULL; + /* We need the size of wchar_t buffers correspond to csize1, csize2. */ + int size1 = 0, size2 = 0; + /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ + int *mbs_offset1 = NULL, *mbs_offset2 = NULL; + /* They hold whether each wchar_t is binary data or not. */ + char *is_binary = NULL; +#endif /* MBS_SUPPORT */ /* Just past the end of the corresponding string. */ - const char *end1, *end2; + const CHAR_TYPE *end1, *end2; /* Pointers into string1 and string2, just past the last characters in each to consider matching. */ - const char *end_match_1, *end_match_2; + const CHAR_TYPE *end_match_1, *end_match_2; /* Where we are in the data, and the end of the current string. */ - const char *d, *dend; + const CHAR_TYPE *d, *dend; /* Where we are in the pattern, and the end of the pattern. */ - unsigned char *p = bufp->buffer; - register unsigned char *pend = p + bufp->used; +#ifdef MBS_SUPPORT + US_CHAR_TYPE *pattern, *p; + register US_CHAR_TYPE *pend; +#else + US_CHAR_TYPE *p = bufp->buffer; + register US_CHAR_TYPE *pend = p + bufp->used; +#endif /* MBS_SUPPORT */ /* Mark the opcode just after a start_memory, so we can test for an empty subpattern when we get to the stop_memory. */ - unsigned char *just_past_start_mem = 0; + US_CHAR_TYPE *just_past_start_mem = 0; /* We use this to map every character in the string. */ RE_TRANSLATE_TYPE translate = bufp->translate; @@ -3934,7 +5421,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) stopped matching the regnum-th subexpression. (The zeroth register keeps track of what the whole pattern matches.) */ #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ - const char **regstart, **regend; + const CHAR_TYPE **regstart, **regend; #endif /* If a group that's operated upon by a repetition operator fails to @@ -3943,7 +5430,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) are when we last see its open-group operator. Similarly for a register's end. */ #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ - const char **old_regstart, **old_regend; + const CHAR_TYPE **old_regstart, **old_regend; #endif /* The is_active field of reg_info helps us keep track of which (possibly @@ -3962,7 +5449,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) turn happens only if we have not yet matched the entire string. */ unsigned best_regs_set = false; #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ - const char **best_regstart, **best_regend; + const CHAR_TYPE **best_regstart, **best_regend; #endif /* Logically, this is `best_regend[0]'. But we don't want to have to @@ -3973,14 +5460,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) the end of the best match so far in a separate variable. We initialize this to NULL so that when we backtrack the first time and need to test it, it's not garbage. */ - const char *match_end = NULL; + const CHAR_TYPE *match_end = NULL; /* This helps SET_REGS_MATCHED avoid doing redundant work. */ int set_regs_matched_done = 0; /* Used when we pop values we don't care about. */ #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ - const char **reg_dummy; + const CHAR_TYPE **reg_dummy; register_info_type *reg_info_dummy; #endif @@ -4001,14 +5488,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) array indexing. We should fix this. */ if (bufp->re_nsub) { - regstart = REGEX_TALLOC (num_regs, const char *); - regend = REGEX_TALLOC (num_regs, const char *); - old_regstart = REGEX_TALLOC (num_regs, const char *); - old_regend = REGEX_TALLOC (num_regs, const char *); - best_regstart = REGEX_TALLOC (num_regs, const char *); - best_regend = REGEX_TALLOC (num_regs, const char *); + regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *); + regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *); + old_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *); + old_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *); + best_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *); + best_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *); reg_info = REGEX_TALLOC (num_regs, register_info_type); - reg_dummy = REGEX_TALLOC (num_regs, const char *); + reg_dummy = REGEX_TALLOC (num_regs, const CHAR_TYPE *); reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); if (!(regstart && regend && old_regstart && old_regend && reg_info @@ -4029,12 +5516,63 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) #endif /* MATCH_MAY_ALLOCATE */ /* The starting position is bogus. */ +#ifdef MBS_SUPPORT + if (pos < 0 || pos > csize1 + csize2) +#else if (pos < 0 || pos > size1 + size2) +#endif { FREE_VARIABLES (); return -1; } +#ifdef MBS_SUPPORT + /* Allocate wchar_t array for string1 and string2 and + fill them with converted string. */ + if (csize1 != 0) + { + string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE); + mbs_offset1 = REGEX_TALLOC (csize1 + 1, int); + is_binary = REGEX_TALLOC (csize1 + 1, char); + if (!string1 || !mbs_offset1 || !is_binary) + { + FREE_VAR (string1); + FREE_VAR (mbs_offset1); + FREE_VAR (is_binary); + return -2; + } + size1 = convert_mbs_to_wcs(string1, cstring1, csize1, + mbs_offset1, is_binary); + string1[size1] = L'\0'; /* for a sentinel */ + FREE_VAR (is_binary); + } + if (csize2 != 0) + { + string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE); + mbs_offset2 = REGEX_TALLOC (csize2 + 1, int); + is_binary = REGEX_TALLOC (csize2 + 1, char); + if (!string2 || !mbs_offset2 || !is_binary) + { + FREE_VAR (string1); + FREE_VAR (mbs_offset1); + FREE_VAR (string2); + FREE_VAR (mbs_offset2); + FREE_VAR (is_binary); + return -2; + } + size2 = convert_mbs_to_wcs(string2, cstring2, csize2, + mbs_offset2, is_binary); + string2[size2] = L'\0'; /* for a sentinel */ + FREE_VAR (is_binary); + } + + /* We need to cast pattern to (wchar_t*), because we casted this compiled + pattern to (char*) in regex_compile. */ + p = pattern = (CHAR_TYPE*)bufp->buffer; + pend = (CHAR_TYPE*)(bufp->buffer + bufp->used); + +#endif /* MBS_SUPPORT */ + /* Initialize subexpression text positions to -1 to mark ones that no start_memory/stop_memory has been seen for. Also initialize the register information struct. */ @@ -4062,6 +5600,25 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) end2 = string2 + size2; /* Compute where to stop matching, within the two strings. */ +#ifdef MBS_SUPPORT + if (stop <= csize1) + { + mcnt = count_mbs_length(mbs_offset1, stop); + end_match_1 = string1 + mcnt; + end_match_2 = string2; + } + else + { + end_match_1 = end1; + mcnt = count_mbs_length(mbs_offset2, stop-csize1); + end_match_2 = string2 + mcnt; + } + if (mcnt < 0) + { /* count_mbs_length return error. */ + FREE_VARIABLES (); + return -1; + } +#else if (stop <= size1) { end_match_1 = string1 + stop; @@ -4072,6 +5629,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) end_match_1 = end1; end_match_2 = string2 + stop - size1; } +#endif /* MBS_SUPPORT */ /* `p' scans through the pattern as `d' scans through the data. `dend' is the end of the input string that `d' points within. `d' @@ -4079,6 +5637,26 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) this happens before fetching; therefore, at the beginning of the loop, `d' can be pointing at the end of a string, but it cannot equal `string2'. */ +#ifdef MBS_SUPPORT + if (size1 > 0 && pos <= csize1) + { + mcnt = count_mbs_length(mbs_offset1, pos); + d = string1 + mcnt; + dend = end_match_1; + } + else + { + mcnt = count_mbs_length(mbs_offset2, pos-csize1); + d = string2 + mcnt; + dend = end_match_2; + } + + if (mcnt < 0) + { /* count_mbs_length return error. */ + FREE_VARIABLES (); + return -1; + } +#else if (size1 > 0 && pos <= size1) { d = string1 + pos; @@ -4089,6 +5667,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) d = string2 + pos - size1; dend = end_match_2; } +#endif /* MBS_SUPPORT */ DEBUG_PRINT1 ("The compiled pattern is:\n"); DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); @@ -4178,11 +5757,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) succeed_label: DEBUG_PRINT1 ("Accepting match.\n"); - /* If caller wants register contents data back, do it. */ if (regs && !bufp->no_sub) { - /* Have the register data arrays been allocated? */ + /* Have the register data arrays been allocated? */ if (bufp->regs_allocated == REGS_UNALLOCATED) { /* No. So allocate them with malloc. We need one extra element beyond `num_regs' for the `-1' marker @@ -4226,9 +5804,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (regs->num_regs > 0) { regs->start[0] = pos; +#ifdef MBS_SUPPORT + if (MATCHING_IN_FIRST_STRING) + regs->end[0] = mbs_offset1 != NULL ? + mbs_offset1[d-string1] : 0; + else + regs->end[0] = csize1 + (mbs_offset2 != NULL ? + mbs_offset2[d-string2] : 0); +#else regs->end[0] = (MATCHING_IN_FIRST_STRING ? ((regoff_t) (d - string1)) : ((regoff_t) (d - string2 + size1))); +#endif /* MBS_SUPPORT */ } /* Go through the first `min (num_regs, regs->num_regs)' @@ -4261,9 +5848,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) nfailure_points_pushed - nfailure_points_popped); DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); +#ifdef MBS_SUPPORT + if (MATCHING_IN_FIRST_STRING) + mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0; + else + mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) + + csize1; + mcnt -= pos; +#else mcnt = d - pos - (MATCHING_IN_FIRST_STRING ? string1 : string2 - size1); +#endif /* MBS_SUPPORT */ DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); @@ -4288,6 +5884,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) byte in the pattern defines n, and the n bytes after that are the characters to match. */ case exactn: +#ifdef MBS_SUPPORT + case exactn_bin: +#endif mcnt = *p++; DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); @@ -4298,9 +5897,23 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) do { PREFETCH (); - if ((unsigned char) translate[(unsigned char) *d++] - != (unsigned char) *p++) +#ifdef MBS_SUPPORT + if (*d <= 0xff) + { + if ((US_CHAR_TYPE) translate[(unsigned char) *d++] + != (US_CHAR_TYPE) *p++) + goto fail; + } + else + { + if (*d++ != (CHAR_TYPE) *p++) + goto fail; + } +#else + if ((US_CHAR_TYPE) translate[(unsigned char) *d++] + != (US_CHAR_TYPE) *p++) goto fail; +#endif /* MBS_SUPPORT */ } while (--mcnt); } @@ -4309,7 +5922,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) do { PREFETCH (); - if (*d++ != (char) *p++) goto fail; + if (*d++ != (CHAR_TYPE) *p++) goto fail; } while (--mcnt); } @@ -4328,7 +5941,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) goto fail; SET_REGS_MATCHED (); - DEBUG_PRINT2 (" Matched `%d'.\n", *d); + DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d); d++; break; @@ -4336,14 +5949,358 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case charset: case charset_not: { - register unsigned char c; + register US_CHAR_TYPE c; +#ifdef MBS_SUPPORT + unsigned int i, char_class_length, coll_symbol_length, + equiv_class_length, ranges_length, chars_length, length; + CHAR_TYPE *workp, *workp2, *charset_top; +#define WORK_BUFFER_SIZE 128 + CHAR_TYPE str_buf[WORK_BUFFER_SIZE]; +# ifdef _LIBC + uint32_t nrules; +# endif /* _LIBC */ +#endif /* MBS_SUPPORT */ boolean not = (re_opcode_t) *(p - 1) == charset_not; DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); - PREFETCH (); c = TRANSLATE (*d); /* The character to match. */ +#ifdef MBS_SUPPORT +# ifdef _LIBC + nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); +# endif /* _LIBC */ + charset_top = p - 1; + char_class_length = *p++; + coll_symbol_length = *p++; + equiv_class_length = *p++; + ranges_length = *p++; + chars_length = *p++; + /* p points charset[6], so the address of the next instruction + (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'], + where l=length of char_classes, m=length of collating_symbol, + n=equivalence_class, o=length of char_range, + p'=length of character. */ + workp = p; + /* Update p to indicate the next instruction. */ + p += char_class_length + coll_symbol_length+ equiv_class_length + + 2*ranges_length + chars_length; + + /* match with char_class? */ + for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE) + { + wctype_t wctype; + uintptr_t alignedp = ((uintptr_t)workp + + __alignof__(wctype_t) - 1) + & ~(uintptr_t)(__alignof__(wctype_t) - 1); + wctype = *((wctype_t*)alignedp); + workp += CHAR_CLASS_SIZE; + if (iswctype((wint_t)c, wctype)) + goto char_set_matched; + } + + /* match with collating_symbol? */ +# ifdef _LIBC + if (nrules != 0) + { + const unsigned char *extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); + + for (workp2 = workp + coll_symbol_length ; workp < workp2 ; + workp++) + { + int32_t *wextra; + wextra = (int32_t*)(extra + *workp++); + for (i = 0; i < *wextra; ++i) + if (TRANSLATE(d[i]) != wextra[1 + i]) + break; + + if (i == *wextra) + { + /* Update d, however d will be incremented at + char_set_matched:, we decrement d here. */ + d += i - 1; + goto char_set_matched; + } + } + } + else /* (nrules == 0) */ +# endif + /* If we can't look up collation data, we use wcscoll + instead. */ + { + for (workp2 = workp + coll_symbol_length ; workp < workp2 ;) + { + const CHAR_TYPE *backup_d = d, *backup_dend = dend; + length = wcslen(workp); + + /* If wcscoll(the collating symbol, whole string) > 0, + any substring of the string never match with the + collating symbol. */ + if (wcscoll(workp, d) > 0) + { + workp += length + 1; + continue; + } + + /* First, we compare the collating symbol with + the first character of the string. + If it don't match, we add the next character to + the compare buffer in turn. */ + for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++) + { + int match; + if (d == dend) + { + if (dend == end_match_2) + break; + d = string2; + dend = end_match_2; + } + + /* add next character to the compare buffer. */ + str_buf[i] = TRANSLATE(*d); + str_buf[i+1] = '\0'; + + match = wcscoll(workp, str_buf); + if (match == 0) + goto char_set_matched; + + if (match < 0) + /* (str_buf > workp) indicate (str_buf + X > workp), + because for all X (str_buf + X > str_buf). + So we don't need continue this loop. */ + break; + + /* Otherwise(str_buf < workp), + (str_buf+next_character) may equals (workp). + So we continue this loop. */ + } + /* not matched */ + d = backup_d; + dend = backup_dend; + workp += length + 1; + } + } + /* match with equivalence_class? */ +# ifdef _LIBC + if (nrules != 0) + { + const CHAR_TYPE *backup_d = d, *backup_dend = dend; + /* Try to match the equivalence class against + those known to the collate implementation. */ + const int32_t *table; + const int32_t *weights; + const int32_t *extra; + const int32_t *indirect; + int32_t idx, idx2; + wint_t *cp; + size_t len; + + /* This #include defines a local function! */ +# include <locale/weightwc.h> + + table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC); + weights = (const wint_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC); + extra = (const wint_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC); + indirect = (const int32_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC); + + /* Write 1 collating element to str_buf, and + get its index. */ + idx2 = 0; + + for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++) + { + cp = (wint_t*)str_buf; + if (d == dend) + { + if (dend == end_match_2) + break; + d = string2; + dend = end_match_2; + } + str_buf[i] = TRANSLATE(*(d+i)); + str_buf[i+1] = '\0'; /* sentinel */ + idx2 = findidx ((const wint_t**)&cp); + } + + /* Update d, however d will be incremented at + char_set_matched:, we decrement d here. */ + d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1); + if (d >= dend) + { + if (dend == end_match_2) + d = dend; + else + { + d = string2; + dend = end_match_2; + } + } + + len = weights[idx2]; + for (workp2 = workp + equiv_class_length ; workp < workp2 ; + workp++) + { + idx = (int32_t)*workp; + /* We already checked idx != 0 in regex_compile. */ + + if (idx2 != 0 && len == weights[idx]) + { + int cnt = 0; + while (cnt < len && (weights[idx + 1 + cnt] + == weights[idx2 + 1 + cnt])) + ++cnt; + + if (cnt == len) + goto char_set_matched; + } + } + /* not matched */ + d = backup_d; + dend = backup_dend; + } + else /* (nrules == 0) */ +# endif + /* If we can't look up collation data, we use wcscoll + instead. */ + { + for (workp2 = workp + equiv_class_length ; workp < workp2 ;) + { + const CHAR_TYPE *backup_d = d, *backup_dend = dend; + length = wcslen(workp); + + /* If wcscoll(the collating symbol, whole string) > 0, + any substring of the string never match with the + collating symbol. */ + if (wcscoll(workp, d) > 0) + { + workp += length + 1; + break; + } + + /* First, we compare the equivalence class with + the first character of the string. + If it don't match, we add the next character to + the compare buffer in turn. */ + for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++) + { + int match; + if (d == dend) + { + if (dend == end_match_2) + break; + d = string2; + dend = end_match_2; + } + + /* add next character to the compare buffer. */ + str_buf[i] = TRANSLATE(*d); + str_buf[i+1] = '\0'; + + match = wcscoll(workp, str_buf); + + if (match == 0) + goto char_set_matched; + + if (match < 0) + /* (str_buf > workp) indicate (str_buf + X > workp), + because for all X (str_buf + X > str_buf). + So we don't need continue this loop. */ + break; + + /* Otherwise(str_buf < workp), + (str_buf+next_character) may equals (workp). + So we continue this loop. */ + } + /* not matched */ + d = backup_d; + dend = backup_dend; + workp += length + 1; + } + } + + /* match with char_range? */ +#ifdef _LIBC + if (nrules != 0) + { + uint32_t collseqval; + const char *collseq = (const char *) + _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC); + + collseqval = collseq_table_lookup (collseq, c); + + for (; workp < p - chars_length ;) + { + uint32_t start_val, end_val; + + /* We already compute the collation sequence value + of the characters (or collating symbols). */ + start_val = (uint32_t) *workp++; /* range_start */ + end_val = (uint32_t) *workp++; /* range_end */ + + if (start_val <= collseqval && collseqval <= end_val) + goto char_set_matched; + } + } + else +#endif + { + /* We set range_start_char at str_buf[0], range_end_char + at str_buf[4], and compared char at str_buf[2]. */ + str_buf[1] = 0; + str_buf[2] = c; + str_buf[3] = 0; + str_buf[5] = 0; + for (; workp < p - chars_length ;) + { + wchar_t *range_start_char, *range_end_char; + + /* match if (range_start_char <= c <= range_end_char). */ + + /* If range_start(or end) < 0, we assume -range_start(end) + is the offset of the collating symbol which is specified + as the character of the range start(end). */ + + /* range_start */ + if (*workp < 0) + range_start_char = charset_top - (*workp++); + else + { + str_buf[0] = *workp++; + range_start_char = str_buf; + } + + /* range_end */ + if (*workp < 0) + range_end_char = charset_top - (*workp++); + else + { + str_buf[4] = *workp++; + range_end_char = str_buf + 4; + } + + if (wcscoll(range_start_char, str_buf+2) <= 0 && + wcscoll(str_buf+2, range_end_char) <= 0) + + goto char_set_matched; + } + } + + /* match with char? */ + for (; workp < p ; workp++) + if (c == *workp) + goto char_set_matched; + + not = !not; + + char_set_matched: + if (not) goto fail; +#else /* Cast to `unsigned' instead of `unsigned char' in case the bit list is a full 32 bytes long. */ if (c < (unsigned) (*p * BYTEWIDTH) @@ -4353,7 +6310,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) p += 1 + *p; if (!not) goto fail; - +#undef WORK_BUFFER_SIZE +#endif /* MBS_SUPPORT */ SET_REGS_MATCHED (); d++; break; @@ -4366,7 +6324,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) matched within the group is recorded (in the internal registers data structure) under the register number. */ case start_memory: - DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); + DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n", + (long int) *p, (long int) p[1]); /* Find out if this group can match the empty string. */ p1 = p; /* To send to group_match_null_string_p. */ @@ -4414,7 +6373,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) arguments are the same as start_memory's: the register number, and the number of inner groups. */ case stop_memory: - DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); + DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n", + (long int) *p, (long int) p[1]); /* We need to save the string position the last time we were at this close-group operator in case the group is operated @@ -4448,7 +6408,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) it isn't necessarily one less than now: consider (a(b)c(d(e)f)g). When group 3 ends, after the f), the new highest active register is 1. */ - unsigned char r = *p - 1; + US_CHAR_TYPE r = *p - 1; while (r > 0 && !IS_ACTIVE (reg_info[r])) r--; @@ -4491,7 +6451,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case dummy_failure_jump: EXTRACT_NUMBER_AND_INCR (mcnt, p1); if (is_a_jump_n) - p1 += 2; + p1 += OFFSET_ADDRESS_SIZE; break; default: @@ -4505,7 +6465,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) by forcing a failure after pushing on the stack the on_failure_jump's jump in the pattern, and d. */ if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump - && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) + && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory + && p1[2+OFFSET_ADDRESS_SIZE] == *p) { /* If this group ever matched anything, then restore what its registers were before trying this last @@ -4551,7 +6512,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) followed by the numeric value of <digit> as the register number. */ case duplicate: { - register const char *d2, *dend2; + register const CHAR_TYPE *d2, *dend2; int regno = *p++; /* Get which register to match against. */ DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); @@ -4601,7 +6562,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) past them. */ if (translate ? bcmp_translate (d, d2, mcnt, translate) - : memcmp (d, d2, mcnt)) + : memcmp (d, d2, mcnt*sizeof(US_CHAR_TYPE))) goto fail; d += mcnt, d2 += mcnt; @@ -4757,7 +6718,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); { - register unsigned char *p2 = p; + register US_CHAR_TYPE *p2 = p; /* Compare the beginning of the repeat with what in the pattern follows its end. If we can establish that there @@ -4782,9 +6743,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) && ((re_opcode_t) *p2 == stop_memory || (re_opcode_t) *p2 == start_memory)) p2 += 3; - else if (p2 + 6 < pend + else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend && (re_opcode_t) *p2 == dummy_failure_jump) - p2 += 6; + p2 += 2 + 2 * OFFSET_ADDRESS_SIZE; else break; } @@ -4800,30 +6761,48 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Consider what happens when matching ":\(.*\)" against ":/". I don't really understand this code yet. */ - p[-3] = (unsigned char) pop_failure_jump; + p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE) + pop_failure_jump; DEBUG_PRINT1 (" End of pattern: change to `pop_failure_jump'.\n"); } else if ((re_opcode_t) *p2 == exactn +#ifdef MBS_SUPPORT + || (re_opcode_t) *p2 == exactn_bin +#endif || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) { - register unsigned char c - = *p2 == (unsigned char) endline ? '\n' : p2[2]; + register US_CHAR_TYPE c + = *p2 == (US_CHAR_TYPE) endline ? '\n' : p2[2]; - if ((re_opcode_t) p1[3] == exactn && p1[5] != c) + if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn +#ifdef MBS_SUPPORT + || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin +#endif + ) && p1[3+OFFSET_ADDRESS_SIZE] != c) { - p[-3] = (unsigned char) pop_failure_jump; - DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", - c, p1[5]); + p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE) + pop_failure_jump; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1) + DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n", + (wint_t) c, + (wint_t) p1[3+OFFSET_ADDRESS_SIZE]); + else +#endif + DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", + (char) c, + (char) p1[3+OFFSET_ADDRESS_SIZE]); } +#ifndef MBS_SUPPORT else if ((re_opcode_t) p1[3] == charset || (re_opcode_t) p1[3] == charset_not) { int not = (re_opcode_t) p1[3] == charset_not; - if (c < (unsigned char) (p1[4] * BYTEWIDTH) + if (c < (unsigned) (p1[4] * BYTEWIDTH) && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) not = !not; @@ -4835,7 +6814,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); } } +#endif /* not MBS_SUPPORT */ } +#ifndef MBS_SUPPORT else if ((re_opcode_t) *p2 == charset) { /* We win if the first character of the loop is not part @@ -4884,11 +6865,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } } } +#endif /* not MBS_SUPPORT */ } - p -= 2; /* Point at relative address again. */ + p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */ if ((re_opcode_t) p[-1] != pop_failure_jump) { - p[-1] = (unsigned char) jump; + p[-1] = (US_CHAR_TYPE) jump; DEBUG_PRINT1 (" Match => jump.\n"); goto unconditional_jump; } @@ -4909,8 +6891,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) register from the stack, since lowest will == highest in `pop_failure_point'. */ active_reg_t dummy_low_reg, dummy_high_reg; - unsigned char *pdummy; - const char *sdummy; + US_CHAR_TYPE *pdummy = NULL; + const CHAR_TYPE *sdummy = NULL; DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); POP_FAILURE_POINT (sdummy, pdummy, @@ -4975,7 +6957,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Have to succeed matching what follows at least n times. After that, handle like `on_failure_jump'. */ case succeed_n: - EXTRACT_NUMBER (mcnt, p + 2); + EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); assert (mcnt >= 0); @@ -4983,46 +6965,58 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (mcnt > 0) { mcnt--; - p += 2; + p += OFFSET_ADDRESS_SIZE; STORE_NUMBER_AND_INCR (p, mcnt); #ifdef _LIBC - DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt); + DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE + , mcnt); #else - DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE + , mcnt); #endif } else if (mcnt == 0) { #ifdef _LIBC - DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2); + DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", + p + OFFSET_ADDRESS_SIZE); #else - DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); -#endif - p[2] = (unsigned char) no_op; - p[3] = (unsigned char) no_op; + DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", + p + OFFSET_ADDRESS_SIZE); +#endif /* _LIBC */ + +#ifdef MBS_SUPPORT + p[1] = (US_CHAR_TYPE) no_op; +#else + p[2] = (US_CHAR_TYPE) no_op; + p[3] = (US_CHAR_TYPE) no_op; +#endif /* MBS_SUPPORT */ goto on_failure; } break; case jump_n: - EXTRACT_NUMBER (mcnt, p + 2); + EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); /* Originally, this is how many times we CAN jump. */ if (mcnt) { mcnt--; - STORE_NUMBER (p + 2, mcnt); + STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt); + #ifdef _LIBC - DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt); + DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE, + mcnt); #else - DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt); -#endif + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE, + mcnt); +#endif /* _LIBC */ goto unconditional_jump; } /* If don't have to jump any more, skip over the rest of command. */ else - p += 4; + p += 2 * OFFSET_ADDRESS_SIZE; break; case set_number_at: @@ -5254,12 +7248,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) static boolean group_match_null_string_p (p, end, reg_info) - unsigned char **p, *end; + US_CHAR_TYPE **p, *end; register_info_type *reg_info; { int mcnt; /* Point to after the args to the start_memory. */ - unsigned char *p1 = *p + 2; + US_CHAR_TYPE *p1 = *p + 2; while (p1 < end) { @@ -5297,14 +7291,16 @@ group_match_null_string_p (p, end, reg_info) with an on_failure_jump (see above) that jumps to right past a jump_past_alt. */ - while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) + while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] == + jump_past_alt) { /* `mcnt' holds how many bytes long the alternative is, including the ending `jump_past_alt' and its number. */ - if (!alt_match_null_string_p (p1, p1 + mcnt - 3, - reg_info)) + if (!alt_match_null_string_p (p1, p1 + mcnt - + (1 + OFFSET_ADDRESS_SIZE), + reg_info)) return false; /* Move to right after this alternative, including the @@ -5320,10 +7316,11 @@ group_match_null_string_p (p, end, reg_info) alternative that starts with an on_failure_jump. */ p1++; EXTRACT_NUMBER_AND_INCR (mcnt, p1); - if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) + if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] != + jump_past_alt) { /* Get to the beginning of the n-th alternative. */ - p1 -= 3; + p1 -= 1 + OFFSET_ADDRESS_SIZE; break; } } @@ -5331,7 +7328,7 @@ group_match_null_string_p (p, end, reg_info) /* Deal with the last alternative: go back and get number of the `jump_past_alt' just before it. `mcnt' contains the length of the alternative. */ - EXTRACT_NUMBER (mcnt, p1 - 2); + EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE); if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) return false; @@ -5363,11 +7360,11 @@ group_match_null_string_p (p, end, reg_info) static boolean alt_match_null_string_p (p, end, reg_info) - unsigned char *p, *end; + US_CHAR_TYPE *p, *end; register_info_type *reg_info; { int mcnt; - unsigned char *p1 = p; + US_CHAR_TYPE *p1 = p; while (p1 < end) { @@ -5400,13 +7397,13 @@ alt_match_null_string_p (p, end, reg_info) static boolean common_op_match_null_string_p (p, end, reg_info) - unsigned char **p, *end; + US_CHAR_TYPE **p, *end; register_info_type *reg_info; { int mcnt; boolean ret; int reg_no; - unsigned char *p1 = *p; + US_CHAR_TYPE *p1 = *p; switch ((re_opcode_t) *p1++) { @@ -5452,12 +7449,12 @@ common_op_match_null_string_p (p, end, reg_info) case succeed_n: /* Get to the number of times to succeed. */ - p1 += 2; + p1 += OFFSET_ADDRESS_SIZE; EXTRACT_NUMBER_AND_INCR (mcnt, p1); if (mcnt == 0) { - p1 -= 4; + p1 -= 2 * OFFSET_ADDRESS_SIZE; EXTRACT_NUMBER_AND_INCR (mcnt, p1); p1 += mcnt; } @@ -5471,7 +7468,7 @@ common_op_match_null_string_p (p, end, reg_info) break; case set_number_at: - p1 += 4; + p1 += 2 * OFFSET_ADDRESS_SIZE; default: /* All other opcodes mean we cannot match the empty string. */ @@ -5488,15 +7485,21 @@ common_op_match_null_string_p (p, end, reg_info) static int bcmp_translate (s1, s2, len, translate) - const char *s1, *s2; + const CHAR_TYPE *s1, *s2; register int len; RE_TRANSLATE_TYPE translate; { - register const unsigned char *p1 = (const unsigned char *) s1; - register const unsigned char *p2 = (const unsigned char *) s2; + register const US_CHAR_TYPE *p1 = (const US_CHAR_TYPE *) s1; + register const US_CHAR_TYPE *p2 = (const US_CHAR_TYPE *) s2; while (len) { +#ifdef MBS_SUPPORT + if (((*p1<=0xff)?translate[*p1++]:*p1++) + != ((*p2<=0xff)?translate[*p2++]:*p2++)) + return 1; +#else if (translate[*p1++] != translate[*p2++]) return 1; +#endif /* MBS_SUPPORT */ len--; } return 0; @@ -5716,7 +7719,7 @@ regcomp (preg, pattern, cflags) buffer. */ if (re_compile_fastmap (preg) == -2) { - /* Some error occured while computing the fastmap, just forget + /* Some error occurred while computing the fastmap, just forget about it. */ free (preg->fastmap); preg->fastmap = NULL; |