diff options
author | ale <ale@FreeBSD.org> | 2007-05-20 08:27:29 +0000 |
---|---|---|
committer | ale <ale@FreeBSD.org> | 2007-05-20 08:27:29 +0000 |
commit | 6c3301ebfbe0baad2a18f3d750ceb299c059da09 (patch) | |
tree | 462b73bc9f45977728c82653dd595b1a6dced232 /devel/php5-pcre | |
parent | 8eb4d8387c1d957325489cb4f1bebdd1e29e3b73 (diff) | |
download | FreeBSD-ports-6c3301ebfbe0baad2a18f3d750ceb299c059da09.zip FreeBSD-ports-6c3301ebfbe0baad2a18f3d750ceb299c059da09.tar.gz |
Update to 5.2.2 release.
PR: ports/112527
Submitted by: Nick Barkas<snb@threerings.net>
Approved by: portmgr (linimon)
Diffstat (limited to 'devel/php5-pcre')
-rw-r--r-- | devel/php5-pcre/Makefile | 2 | ||||
-rw-r--r-- | devel/php5-pcre/files/patch-pcre-7.0 | 10262 |
2 files changed, 0 insertions, 10264 deletions
diff --git a/devel/php5-pcre/Makefile b/devel/php5-pcre/Makefile index 5c28e9f..bfa9970 100644 --- a/devel/php5-pcre/Makefile +++ b/devel/php5-pcre/Makefile @@ -5,8 +5,6 @@ # $FreeBSD$ # -PORTREVISION= 5 - CATEGORIES= devel MASTERDIR= ${.CURDIR}/../../lang/php5 diff --git a/devel/php5-pcre/files/patch-pcre-7.0 b/devel/php5-pcre/files/patch-pcre-7.0 deleted file mode 100644 index 724ae26..0000000 --- a/devel/php5-pcre/files/patch-pcre-7.0 +++ /dev/null @@ -1,10262 +0,0 @@ -diff -ruN ../pcre.orig/config.m4 ./config.m4 ---- ../pcre.orig/config.m4 Mon Dec 4 19:01:53 2006 -+++ ./config.m4 Fri Feb 9 22:31:18 2007 -@@ -13,7 +13,7 @@ - - if test "$PHP_PCRE_REGEX" != "no"; then - if test "$PHP_PCRE_REGEX" = "yes"; then -- PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -I@ext_srcdir@/pcrelib) -+ PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -I@ext_srcdir@/pcrelib) - PHP_ADD_BUILD_DIR($ext_builddir/pcrelib) - PHP_INSTALL_HEADERS([ext/pcre], [php_pcre.h pcrelib/]) - AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ]) -diff -ruN ../pcre.orig/pcrelib/dftables.c ./pcrelib/dftables.c ---- ../pcre.orig/pcrelib/dftables.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/dftables.c Fri Feb 9 22:31:19 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -86,7 +86,16 @@ - fprintf(f, - "This file contains the default tables for characters with codes less than\n" - "128 (ASCII characters). These tables are used when no external tables are\n" -- "passed to PCRE. */\n\n" -+ "passed to PCRE.\n\n"); -+fprintf(f, -+ "The following #include is present because without it gcc 4.x may remove\n" -+ "the array definition from the final binary if PCRE is built into a static\n" -+ "library and dead code stripping is activated. This leads to link errors.\n" -+ "Pulling in the header ensures that the array gets flagged as \"someone\n" -+ "outside this compilation unit might reference this\" and so it will always\n" -+ "be supplied to the linker. */\n\n" -+ "#include \"pcre_internal.h\"\n\n"); -+fprintf(f, - "const unsigned char _pcre_default_tables[] = {\n\n" - "/* This table is a lower casing table. */\n\n"); - -diff -ruN ../pcre.orig/pcrelib/pcre.h ./pcrelib/pcre.h ---- ../pcre.orig/pcrelib/pcre.h Wed Jan 3 19:32:27 2007 -+++ ./pcrelib/pcre.h Fri Feb 9 22:31:19 2007 -@@ -5,7 +5,7 @@ - /* This is the public header file for the PCRE library, to be #included by - applications that call the PCRE functions. - -- Copyright (c) 1997-2005 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -38,7 +38,7 @@ - - #ifndef _PCRE_H - #define _PCRE_H -- -+ - #include "php_compat.h" - - /* The current PCRE version information. */ -@@ -54,10 +54,10 @@ - cannot run ./configure. As it now stands, this file need not be edited in that - circumstance. */ - --#define PCRE_MAJOR 6 --#define PCRE_MINOR 7 -+#define PCRE_MAJOR 7 -+#define PCRE_MINOR 0 - #define PCRE_PRERELEASE --#define PCRE_DATE 04-Jul-2006 -+#define PCRE_DATE 18-Dec-2006 - - /* Win32 uses DLL by default; it needs special stuff for exported functions - when building PCRE. */ -@@ -120,6 +120,7 @@ - #define PCRE_NEWLINE_CR 0x00100000 - #define PCRE_NEWLINE_LF 0x00200000 - #define PCRE_NEWLINE_CRLF 0x00300000 -+#define PCRE_NEWLINE_ANY 0x00400000 - - /* Exec-time and get/set-time error codes */ - -@@ -127,7 +128,8 @@ - #define PCRE_ERROR_NULL (-2) - #define PCRE_ERROR_BADOPTION (-3) - #define PCRE_ERROR_BADMAGIC (-4) --#define PCRE_ERROR_UNKNOWN_NODE (-5) -+#define PCRE_ERROR_UNKNOWN_OPCODE (-5) -+#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */ - #define PCRE_ERROR_NOMEMORY (-6) - #define PCRE_ERROR_NOSUBSTRING (-7) - #define PCRE_ERROR_MATCHLIMIT (-8) -@@ -144,6 +146,8 @@ - #define PCRE_ERROR_DFA_WSSIZE (-19) - #define PCRE_ERROR_DFA_RECURSE (-20) - #define PCRE_ERROR_RECURSIONLIMIT (-21) -+#define PCRE_ERROR_NULLWSLIMIT (-22) -+#define PCRE_ERROR_BADNEWLINE (-23) - - /* Request types for pcre_fullinfo() */ - -diff -ruN ../pcre.orig/pcrelib/pcre_compile.c ./pcrelib/pcre_compile.c ---- ../pcre.orig/pcrelib/pcre_compile.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_compile.c Fri Feb 9 22:31:19 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -42,7 +42,11 @@ - supporting internal functions that are not used by other modules. */ - - --#define NLBLOCK cd /* The block containing newline information */ -+#define NLBLOCK cd /* Block containing newline information */ -+#define PSSTART start_pattern /* Field containing processed string start */ -+#define PSEND end_pattern /* Field containing processed string end */ -+ -+ - #include "pcre_internal.h" - - -@@ -54,18 +58,23 @@ - #endif - - -- - /************************************************* - * Code parameters and static tables * - *************************************************/ - --/* Maximum number of items on the nested bracket stacks at compile time. This --applies to the nesting of all kinds of parentheses. It does not limit --un-nested, non-capturing parentheses. This number can be made bigger if --necessary - it is used to dimension one int and one unsigned char vector at --compile time. */ -+/* This value specifies the size of stack workspace that is used during the -+first pre-compile phase that determines how much memory is required. The regex -+is partly compiled into this space, but the compiled parts are discarded as -+soon as they can be, so that hopefully there will never be an overrun. The code -+does, however, check for an overrun. The largest amount I've seen used is 218, -+so this number is very generous. -+ -+The same workspace is used during the second, actual compile phase for -+remembering forward references to groups so that they can be filled in at the -+end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE -+is 4 there is plenty of room. */ - --#define BRASTACK_SIZE 200 -+#define COMPILE_WORK_SIZE (4096) - - - /* Table for handling escaped characters in the range '0'-'z'. Positive returns -@@ -79,10 +88,10 @@ - 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ - '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ - 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ ---ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ -+-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ - -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ - '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ -- 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ -+ 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ - -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ - 0, 0, -ESC_z /* x - z */ - }; -@@ -98,7 +107,7 @@ - /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', - /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, - /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, --/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, -+/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, - /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, - /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, - /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, -@@ -107,7 +116,7 @@ - /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, - /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, - /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, --/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0, -+/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, - /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, - /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, - /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, -@@ -156,8 +165,13 @@ - }; - - -+#define STRING(a) # a -+#define XSTRING(s) STRING(s) -+ - /* The texts of compile-time error messages. These are "char *" because they --are passed to the outside world. */ -+are passed to the outside world. Do not ever re-use any error number, because -+they are documented. Always add a new error instead. Messages marked DEAD below -+are no longer used. */ - - static const char *error_texts[] = { - "no error", -@@ -172,7 +186,7 @@ - "range out of order in character class", - "nothing to repeat", - /* 10 */ -- "operand of unlimited repeat could match the empty string", -+ "operand of unlimited repeat could match the empty string", /** DEAD **/ - "internal error: unexpected repeat", - "unrecognized character after (?", - "POSIX named classes are supported only within a class", -@@ -182,7 +196,7 @@ - "erroffset passed as NULL", - "unknown option bit(s) set", - "missing ) after comment", -- "parentheses nested too deeply", -+ "parentheses nested too deeply", /** DEAD **/ - /* 20 */ - "regular expression too large", - "failed to get memory", -@@ -199,7 +213,7 @@ - "unknown POSIX class name", - "POSIX collating elements are not supported", - "this version of PCRE is not compiled with PCRE_UTF8 support", -- "spare error", -+ "spare error", /** DEAD **/ - "character value in \\x{...} sequence is too large", - /* 35 */ - "invalid condition (?(0)", -@@ -210,18 +224,25 @@ - /* 40 */ - "recursive call could loop indefinitely", - "unrecognized character after (?P", -- "syntax error after (?P", -+ "syntax error in subpattern name (missing terminator)", - "two named subpatterns have the same name", - "invalid UTF-8 string", - /* 45 */ - "support for \\P, \\p, and \\X has not been compiled", - "malformed \\P or \\p sequence", - "unknown property name after \\P or \\p", -- "subpattern name is too long (maximum 32 characters)", -- "too many named subpatterns (maximum 10,000)", -+ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)", -+ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")", - /* 50 */ - "repeated subpattern is too long", -- "octal value is greater than \\377 (not in UTF-8 mode)" -+ "octal value is greater than \\377 (not in UTF-8 mode)", -+ "internal error: overran compiling workspace", -+ "internal error: previously-checked referenced subpattern not found", -+ "DEFINE group contains more than one branch", -+ /* 55 */ -+ "repeating a DEFINE group is not allowed", -+ "inconsistent NEWLINE options", -+ "\\g is not followed by an (optionally braced) non-zero number" - }; - - -@@ -352,8 +373,8 @@ - /* Definition to allow mutual recursion */ - - static BOOL -- compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int, -- int *, int *, branch_chain *, compile_data *); -+ compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *, -+ int *, branch_chain *, compile_data *, int *); - - - -@@ -363,9 +384,11 @@ - - /* This function is called when a \ has been encountered. It either returns a - positive value for a simple escape such as \n, or a negative value which --encodes one of the more complicated things such as \d. When UTF-8 is enabled, --a positive value greater than 255 may be returned. On entry, ptr is pointing at --the \. On exit, it is on the final character of the escape sequence. -+encodes one of the more complicated things such as \d. A backreference to group -+n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When -+UTF-8 is enabled, a positive value greater than 255 may be returned. On entry, -+ptr is pointing at the \. On exit, it is on the final character of the escape -+sequence. - - Arguments: - ptrptr points to the pattern position pointer -@@ -412,6 +435,8 @@ - else - { - const uschar *oldptr; -+ BOOL braced, negated; -+ - switch (c) - { - /* A number of Perl escapes are not handled by PCRE. We give an explicit -@@ -425,6 +450,48 @@ - *errorcodeptr = ERR37; - break; - -+ /* \g must be followed by a number, either plain or braced. If positive, it -+ is an absolute backreference. If negative, it is a relative backreference. -+ This is a Perl 5.10 feature. */ -+ -+ case 'g': -+ if (ptr[1] == '{') -+ { -+ braced = TRUE; -+ ptr++; -+ } -+ else braced = FALSE; -+ -+ if (ptr[1] == '-') -+ { -+ negated = TRUE; -+ ptr++; -+ } -+ else negated = FALSE; -+ -+ c = 0; -+ while ((digitab[ptr[1]] & ctype_digit) != 0) -+ c = c * 10 + *(++ptr) - '0'; -+ -+ if (c == 0 || (braced && *(++ptr) != '}')) -+ { -+ *errorcodeptr = ERR57; -+ return 0; -+ } -+ -+ if (negated) -+ { -+ if (c > bracount) -+ { -+ *errorcodeptr = ERR15; -+ return 0; -+ } -+ c = bracount - (c - 1); -+ } -+ -+ c = -(ESC_REF + c); -+ break; -+ - /* The handling of escape sequences consisting of a string of digits - starting with one that is not zero is not straightforward. By experiment, - the way Perl works seems to be as follows: -@@ -532,7 +599,9 @@ - } - break; - -- /* Other special escapes not starting with a digit are straightforward */ -+ /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. -+ This coding is ASCII-specific, but then the whole concept of \cx is -+ ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ - - case 'c': - c = *(++ptr); -@@ -542,10 +611,6 @@ - return 0; - } - -- /* A letter is upper-cased; then the 0x40 bit is flipped. This coding -- is ASCII-specific, but then the whole concept of \cx is ASCII-specific. -- (However, an EBCDIC equivalent has now been added.) */ -- - #if !EBCDIC /* ASCII coding */ - if (c >= 'a' && c <= 'z') c -= 32; - c ^= 0x40; -@@ -772,42 +837,111 @@ - - - /************************************************* --* Find forward referenced named subpattern * -+* Find forward referenced subpattern * - *************************************************/ - --/* This function scans along a pattern looking for capturing subpatterns, and --counting them. If it finds a named pattern that matches the name it is given, --it returns its number. This is used for forward references to named --subpatterns. We know that if (?P< is encountered, the name will be terminated --by '>' because that is checked in the first pass. -+/* This function scans along a pattern's text looking for capturing -+subpatterns, and counting them. If it finds a named pattern that matches the -+name it is given, it returns its number. Alternatively, if the name is NULL, it -+returns when it reaches a given numbered subpattern. This is used for forward -+references to subpatterns. We know that if (?P< is encountered, the name will -+be terminated by '>' because that is checked in the first pass. - - Arguments: -- pointer current position in the pattern -- count current count of capturing parens -- name name to seek -- namelen name length -+ ptr current position in the pattern -+ count current count of capturing parens so far encountered -+ name name to seek, or NULL if seeking a numbered subpattern -+ lorn name length, or subpattern number if name is NULL -+ xmode TRUE if we are in /x mode - - Returns: the number of the named subpattern, or -1 if not found - */ - - static int --find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen) -+find_parens(const uschar *ptr, int count, const uschar *name, int lorn, -+ BOOL xmode) - { - const uschar *thisname; -+ - for (; *ptr != 0; ptr++) - { -- if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; } -+ int term; -+ -+ /* Skip over backslashed characters and also entire \Q...\E */ -+ -+ if (*ptr == '\\') -+ { -+ if (*(++ptr) == 0) return -1; -+ if (*ptr == 'Q') for (;;) -+ { -+ while (*(++ptr) != 0 && *ptr != '\\'); -+ if (*ptr == 0) return -1; -+ if (*(++ptr) == 'E') break; -+ } -+ continue; -+ } -+ -+ /* Skip over character classes */ -+ -+ if (*ptr == '[') -+ { -+ while (*(++ptr) != ']') -+ { -+ if (*ptr == '\\') -+ { -+ if (*(++ptr) == 0) return -1; -+ if (*ptr == 'Q') for (;;) -+ { -+ while (*(++ptr) != 0 && *ptr != '\\'); -+ if (*ptr == 0) return -1; -+ if (*(++ptr) == 'E') break; -+ } -+ continue; -+ } -+ } -+ continue; -+ } -+ -+ /* Skip comments in /x mode */ -+ -+ if (xmode && *ptr == '#') -+ { -+ while (*(++ptr) != 0 && *ptr != '\n'); -+ if (*ptr == 0) return -1; -+ continue; -+ } -+ -+ /* An opening parens must now be a real metacharacter */ -+ - if (*ptr != '(') continue; -- if (ptr[1] != '?') { count++; continue; } -- if (ptr[2] == '(') { ptr += 2; continue; } -- if (ptr[2] != 'P' || ptr[3] != '<') continue; -+ if (ptr[1] != '?') -+ { -+ count++; -+ if (name == NULL && count == lorn) return count; -+ continue; -+ } -+ -+ ptr += 2; -+ if (*ptr == 'P') ptr++; /* Allow optional P */ -+ -+ /* We have to disambiguate (?<! and (?<= from (?<name> */ -+ -+ if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') && -+ *ptr != '\'') -+ continue; -+ - count++; -- ptr += 4; -+ -+ if (name == NULL && count == lorn) return count; -+ term = *ptr++; -+ if (term == '<') term = '>'; - thisname = ptr; -- while (*ptr != '>') ptr++; -- if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0) -+ while (*ptr != term) ptr++; -+ if (name != NULL && lorn == ptr - thisname && -+ strncmp((const char *)name, (const char *)thisname, lorn) == 0) - return count; - } -+ - return -1; - } - -@@ -862,7 +996,8 @@ - - case OP_CALLOUT: - case OP_CREF: -- case OP_BRANUMBER: -+ case OP_RREF: -+ case OP_DEF: - code += _pcre_OP_lengths[*code]; - break; - -@@ -907,14 +1042,14 @@ - { - int d; - register int op = *cc; -- if (op >= OP_BRA) op = OP_BRA; - - switch (op) - { -+ case OP_CBRA: - case OP_BRA: - case OP_ONCE: - case OP_COND: -- d = find_fixedlength(cc, options); -+ d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options); - if (d < 0) return d; - branchlength += d; - do cc += GET(cc, 1); while (*cc == OP_ALT); -@@ -949,8 +1084,9 @@ - /* Skip over things that don't match chars */ - - case OP_REVERSE: -- case OP_BRANUMBER: - case OP_CREF: -+ case OP_RREF: -+ case OP_DEF: - case OP_OPT: - case OP_CALLOUT: - case OP_SOD: -@@ -1094,21 +1230,18 @@ - - if (c == OP_XCLASS) code += GET(code, 1); - -- /* Handle bracketed group */ -+ /* Handle capturing bracket */ - -- else if (c > OP_BRA) -+ else if (c == OP_CBRA) - { -- int n = c - OP_BRA; -- if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE); -+ int n = GET2(code, 1+LINK_SIZE); - if (n == number) return (uschar *)code; -- code += _pcre_OP_lengths[OP_BRA]; -+ code += _pcre_OP_lengths[c]; - } - -- /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes -- that are followed by a character may be followed by a multi-byte character. -- The length in the table is a minimum, so we have to scan along to skip the -- extra bytes. All opcodes are less than 128, so we can use relatively -- efficient code. */ -+ /* In UTF-8 mode, opcodes that are followed by a character may be followed by -+ a multi-byte character. The length in the table is a minimum, so we have to -+ arrange to skip the extra bytes. */ - - else - { -@@ -1120,13 +1253,17 @@ - case OP_EXACT: - case OP_UPTO: - case OP_MINUPTO: -+ case OP_POSUPTO: - case OP_STAR: - case OP_MINSTAR: -+ case OP_POSSTAR: - case OP_PLUS: - case OP_MINPLUS: -+ case OP_POSPLUS: - case OP_QUERY: - case OP_MINQUERY: -- while ((*code & 0xc0) == 0x80) code++; -+ case OP_POSQUERY: -+ if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; - break; - } - } -@@ -1164,18 +1301,10 @@ - - if (c == OP_XCLASS) code += GET(code, 1); - -- /* All bracketed groups have the same length. */ -- -- else if (c > OP_BRA) -- { -- code += _pcre_OP_lengths[OP_BRA]; -- } -- - /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes - that are followed by a character may be followed by a multi-byte character. -- The length in the table is a minimum, so we have to scan along to skip the -- extra bytes. All opcodes are less than 128, so we can use relatively -- efficient code. */ -+ The length in the table is a minimum, so we have to arrange to skip the extra -+ bytes. */ - - else - { -@@ -1187,13 +1316,17 @@ - case OP_EXACT: - case OP_UPTO: - case OP_MINUPTO: -+ case OP_POSUPTO: - case OP_STAR: - case OP_MINSTAR: -+ case OP_POSSTAR: - case OP_PLUS: - case OP_MINPLUS: -+ case OP_POSPLUS: - case OP_QUERY: - case OP_MINQUERY: -- while ((*code & 0xc0) == 0x80) code++; -+ case OP_POSQUERY: -+ if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; - break; - } - } -@@ -1207,10 +1340,11 @@ - *************************************************/ - - /* This function scans through a branch of a compiled pattern to see whether it --can match the empty string or not. It is called only from could_be_empty() --below. Note that first_significant_code() skips over assertions. If we hit an --unclosed bracket, we return "empty" - this means we've struck an inner bracket --whose current branch will already have been scanned. -+can match the empty string or not. It is called from could_be_empty() -+below and from compile_branch() when checking for an unlimited repeat of a -+group that can match nothing. Note that first_significant_code() skips over -+assertions. If we hit an unclosed bracket, we return "empty" - this means we've -+struck an inner bracket whose current branch will already have been scanned. - - Arguments: - code points to start of search -@@ -1224,7 +1358,7 @@ - could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8) - { - register int c; --for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE); -+for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); - code < endcode; - code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) - { -@@ -1232,7 +1366,7 @@ - - c = *code; - -- if (c >= OP_BRA) -+ if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE) - { - BOOL empty_branch; - if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ -@@ -1248,11 +1382,18 @@ - } - while (*code == OP_ALT); - if (!empty_branch) return FALSE; /* All branches are non-empty */ -- code += 1 + LINK_SIZE; -- c = *code; -+ -+ /* Move past the KET and fudge things so that the increment in the "for" -+ above has no effect. */ -+ -+ c = OP_END; -+ code += 1 + LINK_SIZE - _pcre_OP_lengths[c]; -+ continue; - } - -- else switch (c) -+ /* Handle the other opcodes */ -+ -+ switch (c) - { - /* Check for quantifiers after a class */ - -@@ -1308,12 +1449,15 @@ - case OP_NOT: - case OP_PLUS: - case OP_MINPLUS: -+ case OP_POSPLUS: - case OP_EXACT: - case OP_NOTPLUS: - case OP_NOTMINPLUS: -+ case OP_NOTPOSPLUS: - case OP_NOTEXACT: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: -+ case OP_TYPEPOSPLUS: - case OP_TYPEEXACT: - return FALSE; - -@@ -1325,16 +1469,19 @@ - case OP_ALT: - return TRUE; - -- /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be -- followed by a multibyte character */ -+ /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, -+ MINUPTO, and POSUPTO may be followed by a multibyte character */ - - #ifdef SUPPORT_UTF8 - case OP_STAR: - case OP_MINSTAR: -+ case OP_POSSTAR: - case OP_QUERY: - case OP_MINQUERY: -+ case OP_POSQUERY: - case OP_UPTO: - case OP_MINUPTO: -+ case OP_POSUPTO: - if (utf8) while ((code[2] & 0xc0) == 0x80) code++; - break; - #endif -@@ -1452,26 +1599,57 @@ - optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before - it, after it has been compiled. This means that any OP_RECURSE items within it - that refer to the group itself or any contained groups have to have their --offsets adjusted. That is the job of this function. Before it is called, the --partially compiled regex must be temporarily terminated with OP_END. -+offsets adjusted. That one of the jobs of this function. Before it is called, -+the partially compiled regex must be temporarily terminated with OP_END. -+ -+This function has been extended with the possibility of forward references for -+recursions and subroutine calls. It must also check the list of such references -+for the group we are dealing with. If it finds that one of the recursions in -+the current group is on this list, it adjusts the offset in the list, not the -+value in the reference (which is a group number). - - Arguments: - group points to the start of the group - adjust the amount by which the group is to be moved - utf8 TRUE in UTF-8 mode - cd contains pointers to tables etc. -+ save_hwm the hwm forward reference pointer at the start of the group - - Returns: nothing - */ - - static void --adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd) -+adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, -+ uschar *save_hwm) - { - uschar *ptr = group; - while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) - { -- int offset = GET(ptr, 1); -- if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); -+ int offset; -+ uschar *hc; -+ -+ /* See if this recursion is on the forward reference list. If so, adjust the -+ reference. */ -+ -+ for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) -+ { -+ offset = GET(hc, 0); -+ if (cd->start_code + offset == ptr + 1) -+ { -+ PUT(hc, 0, offset + adjust); -+ break; -+ } -+ } -+ -+ /* Otherwise, adjust the recursion offset if it's after the start of this -+ group. */ -+ -+ if (hc >= cd->hwm) -+ { -+ offset = GET(ptr, 1); -+ if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); -+ } -+ - ptr += 1 + LINK_SIZE; - } - } -@@ -1550,12 +1728,13 @@ - */ - - static BOOL --get_othercase_range(int *cptr, int d, int *ocptr, int *odptr) -+get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, -+ unsigned int *odptr) - { --int c, othercase, next; -+unsigned int c, othercase, next; - - for (c = *cptr; c <= d; c++) -- { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; } -+ { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; } - - if (c > d) return FALSE; - -@@ -1576,17 +1755,249 @@ - #endif /* SUPPORT_UCP */ - - -+ -+/************************************************* -+* Check if auto-possessifying is possible * -+*************************************************/ -+ -+/* This function is called for unlimited repeats of certain items, to see -+whether the next thing could possibly match the repeated item. If not, it makes -+sense to automatically possessify the repeated item. -+ -+Arguments: -+ op_code the repeated op code -+ this data for this item, depends on the opcode -+ utf8 TRUE in UTF-8 mode -+ utf8_char used for utf8 character bytes, NULL if not relevant -+ ptr next character in pattern -+ options options bits -+ cd contains pointers to tables etc. -+ -+Returns: TRUE if possessifying is wanted -+*/ -+ -+static BOOL -+check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, -+ const uschar *ptr, int options, compile_data *cd) -+{ -+int next; -+ -+/* Skip whitespace and comments in extended mode */ -+ -+if ((options & PCRE_EXTENDED) != 0) -+ { -+ for (;;) -+ { -+ while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; -+ if (*ptr == '#') -+ { -+ while (*(++ptr) != 0) -+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } -+ } -+ else break; -+ } -+ } -+ -+/* If the next item is one that we can handle, get its value. A non-negative -+value is a character, a negative value is an escape value. */ -+ -+if (*ptr == '\\') -+ { -+ int temperrorcode = 0; -+ next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); -+ if (temperrorcode != 0) return FALSE; -+ ptr++; /* Point after the escape sequence */ -+ } -+ -+else if ((cd->ctypes[*ptr] & ctype_meta) == 0) -+ { -+#ifdef SUPPORT_UTF8 -+ if (utf8) { GETCHARINC(next, ptr); } else -+#endif -+ next = *ptr++; -+ } -+ -+else return FALSE; -+ -+/* Skip whitespace and comments in extended mode */ -+ -+if ((options & PCRE_EXTENDED) != 0) -+ { -+ for (;;) -+ { -+ while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; -+ if (*ptr == '#') -+ { -+ while (*(++ptr) != 0) -+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } -+ } -+ else break; -+ } -+ } -+ -+/* If the next thing is itself optional, we have to give up. */ -+ -+if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) -+ return FALSE; -+ -+/* Now compare the next item with the previous opcode. If the previous is a -+positive single character match, "item" either contains the character or, if -+"item" is greater than 127 in utf8 mode, the character's bytes are in -+utf8_char. */ -+ -+ -+/* Handle cases when the next item is a character. */ -+ -+if (next >= 0) switch(op_code) -+ { -+ case OP_CHAR: -+#ifdef SUPPORT_UTF8 -+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); } -+#endif -+ return item != next; -+ -+ /* For CHARNC (caseless character) we must check the other case. If we have -+ Unicode property support, we can use it to test the other case of -+ high-valued characters. */ -+ -+ case OP_CHARNC: -+#ifdef SUPPORT_UTF8 -+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); } -+#endif -+ if (item == next) return FALSE; -+#ifdef SUPPORT_UTF8 -+ if (utf8) -+ { -+ unsigned int othercase; -+ if (next < 128) othercase = cd->fcc[next]; else -+#ifdef SUPPORT_UCP -+ othercase = _pcre_ucp_othercase((unsigned int)next); -+#else -+ othercase = NOTACHAR; -+#endif -+ return (unsigned int)item != othercase; -+ } -+ else -+#endif /* SUPPORT_UTF8 */ -+ return (item != cd->fcc[next]); /* Non-UTF-8 mode */ -+ -+ /* For OP_NOT, "item" must be a single-byte character. */ -+ -+ case OP_NOT: -+ if (next < 0) return FALSE; /* Not a character */ -+ if (item == next) return TRUE; -+ if ((options & PCRE_CASELESS) == 0) return FALSE; -+#ifdef SUPPORT_UTF8 -+ if (utf8) -+ { -+ unsigned int othercase; -+ if (next < 128) othercase = cd->fcc[next]; else -+#ifdef SUPPORT_UCP -+ othercase = _pcre_ucp_othercase(next); -+#else -+ othercase = NOTACHAR; -+#endif -+ return (unsigned int)item == othercase; -+ } -+ else -+#endif /* SUPPORT_UTF8 */ -+ return (item == cd->fcc[next]); /* Non-UTF-8 mode */ -+ -+ case OP_DIGIT: -+ return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; -+ -+ case OP_NOT_DIGIT: -+ return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; -+ -+ case OP_WHITESPACE: -+ return next > 127 || (cd->ctypes[next] & ctype_space) == 0; -+ -+ case OP_NOT_WHITESPACE: -+ return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; -+ -+ case OP_WORDCHAR: -+ return next > 127 || (cd->ctypes[next] & ctype_word) == 0; -+ -+ case OP_NOT_WORDCHAR: -+ return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; -+ -+ default: -+ return FALSE; -+ } -+ -+ -+/* Handle the case when the next item is \d, \s, etc. */ -+ -+switch(op_code) -+ { -+ case OP_CHAR: -+ case OP_CHARNC: -+#ifdef SUPPORT_UTF8 -+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); } -+#endif -+ switch(-next) -+ { -+ case ESC_d: -+ return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; -+ -+ case ESC_D: -+ return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; -+ -+ case ESC_s: -+ return item > 127 || (cd->ctypes[item] & ctype_space) == 0; -+ -+ case ESC_S: -+ return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; -+ -+ case ESC_w: -+ return item > 127 || (cd->ctypes[item] & ctype_word) == 0; -+ -+ case ESC_W: -+ return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; -+ -+ default: -+ return FALSE; -+ } -+ -+ case OP_DIGIT: -+ return next == -ESC_D || next == -ESC_s || next == -ESC_W; -+ -+ case OP_NOT_DIGIT: -+ return next == -ESC_d; -+ -+ case OP_WHITESPACE: -+ return next == -ESC_S || next == -ESC_d || next == -ESC_w; -+ -+ case OP_NOT_WHITESPACE: -+ return next == -ESC_s; -+ -+ case OP_WORDCHAR: -+ return next == -ESC_W || next == -ESC_s; -+ -+ case OP_NOT_WORDCHAR: -+ return next == -ESC_w || next == -ESC_d; -+ -+ default: -+ return FALSE; -+ } -+ -+/* Control does not reach here */ -+} -+ -+ -+ - /************************************************* - * Compile one branch * - *************************************************/ - --/* Scan the pattern, compiling it into the code vector. If the options are -+/* Scan the pattern, compiling it into the a vector. If the options are - changed during the branch, the pointer is used to change the external options --bits. -+bits. This function is used during the pre-compile phase when we are trying -+to find out the amount of memory needed, as well as during the real compile -+phase. The value of lengthptr distinguishes the two phases. - - Arguments: - optionsptr pointer to the option bits -- brackets points to number of extracting brackets used - codeptr points to the pointer to the current code point - ptrptr points to the current pattern pointer - errorcodeptr points to error code variable -@@ -1594,15 +2005,17 @@ - reqbyteptr set to the last literal character required, else < 0 - bcptr points to current branch chain - cd contains pointers to tables etc. -+ lengthptr NULL during the real compile phase -+ points to length accumulator during pre-compile phase - - Returns: TRUE on success - FALSE, with *errorcodeptr set non-zero on error - */ - - static BOOL --compile_branch(int *optionsptr, int *brackets, uschar **codeptr, -- const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr, -- int *reqbyteptr, branch_chain *bcptr, compile_data *cd) -+compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, -+ int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, -+ compile_data *cd, int *lengthptr) - { - int repeat_type, op_type; - int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ -@@ -1613,8 +2026,11 @@ - int req_caseopt, reqvary, tempreqvary; - int options = *optionsptr; - int after_manual_callout = 0; -+int length_prevgroup = 0; - register int c; - register uschar *code = *codeptr; -+uschar *last_code = code; -+uschar *orig_code = code; - uschar *tempcode; - BOOL inescq = FALSE; - BOOL groupsetfirstbyte = FALSE; -@@ -1622,6 +2038,7 @@ - const uschar *tempptr; - uschar *previous = NULL; - uschar *previous_callout = NULL; -+uschar *save_hwm = NULL; - uschar classbits[32]; - - #ifdef SUPPORT_UTF8 -@@ -1631,6 +2048,11 @@ - uschar utf8_char[6]; - #else - BOOL utf8 = FALSE; -+uschar *utf8_char = NULL; -+#endif -+ -+#ifdef DEBUG -+if (lengthptr != NULL) DPRINTF((">> start branch\n")); - #endif - - /* Set up the default and non-default settings for greediness */ -@@ -1664,6 +2086,7 @@ - BOOL negate_class; - BOOL possessive_quantifier; - BOOL is_quantifier; -+ BOOL is_recurse; - int class_charcount; - int class_lastchar; - int newoptions; -@@ -1671,13 +2094,68 @@ - int skipbytes; - int subreqbyte; - int subfirstbyte; -+ int terminator; - int mclength; - uschar mcbuffer[8]; - -- /* Next byte in the pattern */ -+ /* Get next byte in the pattern */ - - c = *ptr; - -+ /* If we are in the pre-compile phase, accumulate the length used for the -+ previous cycle of this loop. */ -+ -+ if (lengthptr != NULL) -+ { -+#ifdef DEBUG -+ if (code > cd->hwm) cd->hwm = code; /* High water info */ -+#endif -+ if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */ -+ { -+ *errorcodeptr = ERR52; -+ goto FAILED; -+ } -+ -+ /* There is at least one situation where code goes backwards: this is the -+ case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, -+ the class is simply eliminated. However, it is created first, so we have to -+ allow memory for it. Therefore, don't ever reduce the length at this point. -+ */ -+ -+ if (code < last_code) code = last_code; -+ *lengthptr += code - last_code; -+ DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); -+ -+ /* If "previous" is set and it is not at the start of the work space, move -+ it back to there, in order to avoid filling up the work space. Otherwise, -+ if "previous" is NULL, reset the current code pointer to the start. */ -+ -+ if (previous != NULL) -+ { -+ if (previous > orig_code) -+ { -+ memmove(orig_code, previous, code - previous); -+ code -= previous - orig_code; -+ previous = orig_code; -+ } -+ } -+ else code = orig_code; -+ -+ /* Remember where this code item starts so we can pick up the length -+ next time round. */ -+ -+ last_code = code; -+ } -+ -+ /* In the real compile phase, just check the workspace used by the forward -+ reference list. */ -+ -+ else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) -+ { -+ *errorcodeptr = ERR52; -+ goto FAILED; -+ } -+ - /* If in \Q...\E, check for the end; if not, we have a literal */ - - if (inescq && c != 0) -@@ -1692,7 +2170,8 @@ - { - if (previous_callout != NULL) - { -- complete_callout(previous_callout, ptr, cd); -+ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ -+ complete_callout(previous_callout, ptr, cd); - previous_callout = NULL; - } - if ((options & PCRE_AUTO_CALLOUT) != 0) -@@ -1713,7 +2192,8 @@ - if (!is_quantifier && previous_callout != NULL && - after_manual_callout-- <= 0) - { -- complete_callout(previous_callout, ptr, cd); -+ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ -+ complete_callout(previous_callout, ptr, cd); - previous_callout = NULL; - } - -@@ -1724,12 +2204,12 @@ - if ((cd->ctypes[c] & ctype_space) != 0) continue; - if (c == '#') - { -- while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; -- if (*ptr != 0) -+ while (*(++ptr) != 0) - { -- ptr += cd->nllen - 1; -- continue; -+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } - } -+ if (*ptr != 0) continue; -+ - /* Else fall through to handle end of string */ - c = 0; - } -@@ -1745,17 +2225,23 @@ - - switch(c) - { -- /* The branch terminates at end of string, |, or ). */ -- -- case 0: -- case '|': -+ /* ===================================================================*/ -+ case 0: /* The branch terminates at string end */ -+ case '|': /* or | or ) */ - case ')': - *firstbyteptr = firstbyte; - *reqbyteptr = reqbyte; - *codeptr = code; - *ptrptr = ptr; -+ if (lengthptr != NULL) -+ { -+ *lengthptr += code - last_code; /* To include callout length */ -+ DPRINTF((">> end branch\n")); -+ } - return TRUE; - -+ -+ /* ===================================================================*/ - /* Handle single-character metacharacters. In multiline mode, ^ disables - the setting of any following char as a first character. */ - -@@ -1784,6 +2270,8 @@ - *code++ = OP_ANY; - break; - -+ -+ /* ===================================================================*/ - /* Character classes. If the included characters are all < 256, we build a - 32-byte bitmap of the permitted characters, except in the special case - where there is only one such character. For negated classes, we build the -@@ -1822,32 +2310,32 @@ - } - - /* Keep a count of chars with values < 256 so that we can optimize the case -- of just a single character (as long as it's < 256). For higher valued UTF-8 -- characters, we don't yet do any optimization. */ -+ of just a single character (as long as it's < 256). However, For higher -+ valued UTF-8 characters, we don't yet do any optimization. */ - - class_charcount = 0; - class_lastchar = -1; - -+ /* Initialize the 32-char bit map to all zeros. We build the map in a -+ temporary bit of memory, in case the class contains only 1 character (less -+ than 256), because in that case the compiled code doesn't use the bit map. -+ */ -+ -+ memset(classbits, 0, 32 * sizeof(uschar)); -+ - #ifdef SUPPORT_UTF8 - class_utf8 = FALSE; /* No chars >= 256 */ -- class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */ -+ class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ - #endif - -- /* Initialize the 32-char bit map to all zeros. We have to build the -- map in a temporary bit of store, in case the class contains only 1 -- character (< 256), because in that case the compiled code doesn't use the -- bit map. */ -- -- memset(classbits, 0, 32 * sizeof(uschar)); -- - /* Process characters until ] is reached. By writing this as a "do" it -- means that an initial ] is taken as a data character. The first pass -- through the regex checked the overall syntax, so we don't need to be very -- strict here. At the start of the loop, c contains the first byte of the -- character. */ -+ means that an initial ] is taken as a data character. At the start of the -+ loop, c contains the first byte of the character. */ - -- do -+ if (c != 0) do - { -+ const uschar *oldptr; -+ - #ifdef SUPPORT_UTF8 - if (utf8 && c > 127) - { /* Braces are required because the */ -@@ -1859,13 +2347,13 @@ - - if (inescq) - { -- if (c == '\\' && ptr[1] == 'E') -+ if (c == '\\' && ptr[1] == 'E') /* If we are at \E */ - { -- inescq = FALSE; -- ptr++; -- continue; -+ inescq = FALSE; /* Reset literal state */ -+ ptr++; /* Skip the 'E' */ -+ continue; /* Carry on with next */ - } -- else goto LONE_SINGLE_CHARACTER; -+ goto CHECK_RANGE; /* Could be range if \E follows */ - } - - /* Handle POSIX class names. Perl allows a negation extension of the -@@ -1956,19 +2444,20 @@ - } - - /* Backslash may introduce a single character, or it may introduce one -- of the specials, which just set a flag. Escaped items are checked for -- validity in the pre-compiling pass. The sequence \b is a special case. -- Inside a class (and only there) it is treated as backspace. Elsewhere -- it marks a word boundary. Other escapes have preset maps ready to -- or into the one we are building. We assume they have more than one -+ of the specials, which just set a flag. The sequence \b is a special -+ case. Inside a class (and only there) it is treated as backspace. -+ Elsewhere it marks a word boundary. Other escapes have preset maps ready -+ to or into the one we are building. We assume they have more than one - character in them, so set class_charcount bigger than one. */ - - if (c == '\\') - { -- c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE); -+ c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); -+ if (*errorcodeptr != 0) goto FAILED; - - if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */ - else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ -+ else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ - else if (-c == ESC_Q) /* Handle start of quoted string */ - { - if (ptr[1] == '\\' && ptr[2] == 'E') -@@ -1983,7 +2472,10 @@ - { - register const uschar *cbits = cd->cbits; - class_charcount += 2; /* Greater than 1 is what matters */ -- switch (-c) -+ -+ /* Save time by not doing this in the pre-compile phase. */ -+ -+ if (lengthptr == NULL) switch (-c) - { - case ESC_d: - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; -@@ -2011,52 +2503,91 @@ - classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ - continue; - --#ifdef SUPPORT_UCP -- case ESC_p: -- case ESC_P: -- { -- BOOL negated; -- int pdata; -- int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); -- if (ptype < 0) goto FAILED; -- class_utf8 = TRUE; -- *class_utf8data++ = ((-c == ESC_p) != negated)? -- XCL_PROP : XCL_NOTPROP; -- *class_utf8data++ = ptype; -- *class_utf8data++ = pdata; -- class_charcount -= 2; /* Not a < 256 character */ -- } -+ case ESC_E: /* Perl ignores an orphan \E */ - continue; --#endif -- -- /* Unrecognized escapes are faulted if PCRE is running in its -- strict mode. By default, for compatibility with Perl, they are -- treated as literals. */ - -- default: -- if ((options & PCRE_EXTRA) != 0) -- { -- *errorcodeptr = ERR7; -- goto FAILED; -- } -- c = *ptr; /* The final character */ -- class_charcount -= 2; /* Undo the default count from above */ -+ default: /* Not recognized; fall through */ -+ break; /* Need "default" setting to stop compiler warning. */ - } -- } - -- /* Fall through if we have a single character (c >= 0). This may be -- > 256 in UTF-8 mode. */ -+ /* In the pre-compile phase, just do the recognition. */ - -- } /* End of backslash handling */ -+ else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || -+ c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; -+ -+ /* We need to deal with \P and \p in both phases. */ -+ -+#ifdef SUPPORT_UCP -+ if (-c == ESC_p || -c == ESC_P) -+ { -+ BOOL negated; -+ int pdata; -+ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); -+ if (ptype < 0) goto FAILED; -+ class_utf8 = TRUE; -+ *class_utf8data++ = ((-c == ESC_p) != negated)? -+ XCL_PROP : XCL_NOTPROP; -+ *class_utf8data++ = ptype; -+ *class_utf8data++ = pdata; -+ class_charcount -= 2; /* Not a < 256 character */ -+ continue; -+ } -+#endif -+ /* Unrecognized escapes are faulted if PCRE is running in its -+ strict mode. By default, for compatibility with Perl, they are -+ treated as literals. */ -+ -+ if ((options & PCRE_EXTRA) != 0) -+ { -+ *errorcodeptr = ERR7; -+ goto FAILED; -+ } -+ -+ class_charcount -= 2; /* Undo the default count from above */ -+ c = *ptr; /* Get the final character and fall through */ -+ } -+ -+ /* Fall through if we have a single character (c >= 0). This may be -+ greater than 256 in UTF-8 mode. */ -+ -+ } /* End of backslash handling */ - - /* A single character may be followed by '-' to form a range. However, - Perl does not permit ']' to be the end of the range. A '-' character -- here is treated as a literal. */ -+ at the end is treated as a literal. Perl ignores orphaned \E sequences -+ entirely. The code for handling \Q and \E is messy. */ -+ -+ CHECK_RANGE: -+ while (ptr[1] == '\\' && ptr[2] == 'E') -+ { -+ inescq = FALSE; -+ ptr += 2; -+ } -+ -+ oldptr = ptr; - -- if (ptr[1] == '-' && ptr[2] != ']') -+ if (!inescq && ptr[1] == '-') - { - int d; - ptr += 2; -+ while (*ptr == '\\' && ptr[1] == 'E') ptr += 2; -+ -+ /* If we hit \Q (not followed by \E) at this point, go into escaped -+ mode. */ -+ -+ while (*ptr == '\\' && ptr[1] == 'Q') -+ { -+ ptr += 2; -+ if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } -+ inescq = TRUE; -+ break; -+ } -+ -+ if (*ptr == 0 || (!inescq && *ptr == ']')) -+ { -+ ptr = oldptr; -+ goto LONE_SINGLE_CHARACTER; -+ } - - #ifdef SUPPORT_UTF8 - if (utf8) -@@ -2071,27 +2602,34 @@ - not any of the other escapes. Perl 5.6 treats a hyphen as a literal - in such circumstances. */ - -- if (d == '\\') -+ if (!inescq && d == '\\') - { -- const uschar *oldptr = ptr; -- d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE); -+ d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); -+ if (*errorcodeptr != 0) goto FAILED; - -- /* \b is backslash; \X is literal X; any other special means the '-' -- was literal */ -+ /* \b is backslash; \X is literal X; \R is literal R; any other -+ special means the '-' was literal */ - - if (d < 0) - { - if (d == -ESC_b) d = '\b'; -- else if (d == -ESC_X) d = 'X'; else -+ else if (d == -ESC_X) d = 'X'; -+ else if (d == -ESC_R) d = 'R'; else - { -- ptr = oldptr - 2; -+ ptr = oldptr; - goto LONE_SINGLE_CHARACTER; /* A few lines below */ - } - } - } - -- /* The check that the two values are in the correct order happens in -- the pre-pass. Optimize one-character ranges */ -+ /* Check that the two values are in the correct order. Optimize -+ one-character ranges */ -+ -+ if (d < c) -+ { -+ *errorcodeptr = ERR8; -+ goto FAILED; -+ } - - if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ - -@@ -2112,9 +2650,9 @@ - #ifdef SUPPORT_UCP - if ((options & PCRE_CASELESS) != 0) - { -- int occ, ocd; -- int cc = c; -- int origd = d; -+ unsigned int occ, ocd; -+ unsigned int cc = c; -+ unsigned int origd = d; - while (get_othercase_range(&cc, origd, &occ, &ocd)) - { - if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */ -@@ -2172,7 +2710,12 @@ - ranges that lie entirely within 0-127 when there is UCP support; else - for partial ranges without UCP support. */ - -- for (; c <= d; c++) -+ class_charcount += d - c + 1; -+ class_lastchar = d; -+ -+ /* We can save a bit of time by skipping this in the pre-compile. */ -+ -+ if (lengthptr == NULL) for (; c <= d; c++) - { - classbits[c/8] |= (1 << (c&7)); - if ((options & PCRE_CASELESS) != 0) -@@ -2180,8 +2723,6 @@ - int uc = cd->fcc[c]; /* flip case */ - classbits[uc/8] |= (1 << (uc&7)); - } -- class_charcount++; /* in case a one-char range */ -- class_lastchar = c; - } - - continue; /* Go get the next char in the class */ -@@ -2205,8 +2746,8 @@ - #ifdef SUPPORT_UCP - if ((options & PCRE_CASELESS) != 0) - { -- int othercase; -- if ((othercase = _pcre_ucp_othercase(c)) >= 0) -+ unsigned int othercase; -+ if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) - { - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); -@@ -2231,10 +2772,15 @@ - } - } - -- /* Loop until ']' reached; the check for end of string happens inside the -- loop. This "while" is the end of the "do" above. */ -+ /* Loop until ']' reached. This "while" is the end of the "do" above. */ - -- while ((c = *(++ptr)) != ']' || inescq); -+ while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); -+ -+ if (c == 0) /* Missing terminating ']' */ -+ { -+ *errorcodeptr = ERR6; -+ goto FAILED; -+ } - - /* If class_charcount is 1, we saw precisely one character whose value is - less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we -@@ -2298,7 +2844,7 @@ - - /* If there are characters with values > 255, we have to compile an - extended class, with its own opcode. If there are no characters < 256, -- we can omit the bitmap. */ -+ we can omit the bitmap in the actual compiled code. */ - - #ifdef SUPPORT_UTF8 - if (class_utf8) -@@ -2308,24 +2854,17 @@ - code += LINK_SIZE; - *code = negate_class? XCL_NOT : 0; - -- /* If the map is required, install it, and move on to the end of -- the extra data */ -+ /* If the map is required, move up the extra data to make room for it; -+ otherwise just move the code pointer to the end of the extra data. */ - - if (class_charcount > 0) - { - *code++ |= XCL_MAP; -+ memmove(code + 32, code, class_utf8data - code); - memcpy(code, classbits, 32); -- code = class_utf8data; -- } -- -- /* If the map is not required, slide down the extra data. */ -- -- else -- { -- int len = class_utf8data - (code + 33); -- memmove(code + 1, code + 33, len); -- code += len + 1; -+ code = class_utf8data + 32; - } -+ else code = class_utf8data; - - /* Now fill in the complete length of the item */ - -@@ -2342,7 +2881,8 @@ - if (negate_class) - { - *code++ = OP_NCLASS; -- for (c = 0; c < 32; c++) code[c] = ~classbits[c]; -+ if (lengthptr == NULL) /* Save time in the pre-compile phase */ -+ for (c = 0; c < 32; c++) code[c] = ~classbits[c]; - } - else - { -@@ -2352,6 +2892,8 @@ - code += 32; - break; - -+ -+ /* ===================================================================*/ - /* Various kinds of repeat; '{' is not necessarily a quantifier, but this - has been tested above. */ - -@@ -2419,20 +2961,6 @@ - } - else repeat_type = greedy_default; - -- /* If previous was a recursion, we need to wrap it inside brackets so that -- it can be replicated if necessary. */ -- -- if (*previous == OP_RECURSE) -- { -- memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); -- code += 1 + LINK_SIZE; -- *previous = OP_BRA; -- PUT(previous, 1, code - previous); -- *code = OP_KET; -- PUT(code, 1, code - previous); -- code += 1 + LINK_SIZE; -- } -- - /* If previous was a character match, abolish the item and generate a - repeat item instead. If a char item has a minumum of more than one, ensure - that it is set in reqbyte - it might not be if a sequence such as x{3} is -@@ -2466,18 +2994,40 @@ - if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; - } - -+ /* If the repetition is unlimited, it pays to see if the next thing on -+ the line is something that cannot possibly match this character. If so, -+ automatically possessifying this item gains some performance in the case -+ where the match fails. */ -+ -+ if (!possessive_quantifier && -+ repeat_max < 0 && -+ check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, -+ options, cd)) -+ { -+ repeat_type = 0; /* Force greedy */ -+ possessive_quantifier = TRUE; -+ } -+ - goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ - } - - /* If previous was a single negated character ([^a] or similar), we use - one of the special opcodes, replacing it. The code is shared with single- - character repeats by setting opt_type to add a suitable offset into -- repeat_type. OP_NOT is currently used only for single-byte chars. */ -+ repeat_type. We can also test for auto-possessification. OP_NOT is -+ currently used only for single-byte chars. */ - - else if (*previous == OP_NOT) - { - op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ - c = previous[1]; -+ if (!possessive_quantifier && -+ repeat_max < 0 && -+ check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) -+ { -+ repeat_type = 0; /* Force greedy */ -+ possessive_quantifier = TRUE; -+ } - goto OUTPUT_SINGLE_REPEAT; - } - -@@ -2495,6 +3045,14 @@ - op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ - c = *previous; - -+ if (!possessive_quantifier && -+ repeat_max < 0 && -+ check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) -+ { -+ repeat_type = 0; /* Force greedy */ -+ possessive_quantifier = TRUE; -+ } -+ - OUTPUT_SINGLE_REPEAT: - if (*previous == OP_PROP || *previous == OP_NOTPROP) - { -@@ -2535,7 +3093,7 @@ - } - - /* A repeat minimum of 1 is optimized into some special cases. If the -- maximum is unlimited, we use OP_PLUS. Otherwise, the original item it -+ maximum is unlimited, we use OP_PLUS. Otherwise, the original item is - left in place and, if the maximum is greater than 1, we use OP_UPTO with - one less than the maximum. */ - -@@ -2588,7 +3146,8 @@ - } - - /* Else insert an UPTO if the max is greater than the min, again -- preceded by the character, for the previously inserted code. */ -+ preceded by the character, for the previously inserted code. If the -+ UPTO is just for 1 instance, we can use QUERY instead. */ - - else if (repeat_max != repeat_min) - { -@@ -2607,8 +3166,16 @@ - *code++ = prop_value; - } - repeat_max -= repeat_min; -- *code++ = OP_UPTO + repeat_type; -- PUT2INC(code, 0, repeat_max); -+ -+ if (repeat_max == 1) -+ { -+ *code++ = OP_QUERY + repeat_type; -+ } -+ else -+ { -+ *code++ = OP_UPTO + repeat_type; -+ PUT2INC(code, 0, repeat_max); -+ } - } - } - -@@ -2675,14 +3242,30 @@ - /* If previous was a bracket group, we may have to replicate it in certain - cases. */ - -- else if (*previous >= OP_BRA || *previous == OP_ONCE || -- *previous == OP_COND) -+ else if (*previous == OP_BRA || *previous == OP_CBRA || -+ *previous == OP_ONCE || *previous == OP_COND) - { - register int i; - int ketoffset = 0; - int len = code - previous; - uschar *bralink = NULL; - -+ /* Repeating a DEFINE group is pointless */ -+ -+ if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) -+ { -+ *errorcodeptr = ERR55; -+ goto FAILED; -+ } -+ -+ /* This is a paranoid check to stop integer overflow later on */ -+ -+ if (len > MAX_DUPLENGTH) -+ { -+ *errorcodeptr = ERR50; -+ goto FAILED; -+ } -+ - /* If the maximum repeat count is unlimited, find the end of the bracket - by scanning through from the start, and compute the offset back to it - from the current code pointer. There may be an OP_OPT setting following -@@ -2717,13 +3300,14 @@ - /* If the maximum is 1 or unlimited, we just have to stick in the - BRAZERO and do no more at this point. However, we do need to adjust - any OP_RECURSE calls inside the group that refer to the group itself or -- any internal group, because the offset is from the start of the whole -- regex. Temporarily terminate the pattern while doing this. */ -+ any internal or forward referenced group, because the offset is from -+ the start of the whole regex. Temporarily terminate the pattern while -+ doing this. */ - - if (repeat_max <= 1) - { - *code = OP_END; -- adjust_recurse(previous, 1, utf8, cd); -+ adjust_recurse(previous, 1, utf8, cd, save_hwm); - memmove(previous+1, previous, len); - code++; - *previous++ = OP_BRAZERO + repeat_type; -@@ -2741,7 +3325,7 @@ - { - int offset; - *code = OP_END; -- adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd); -+ adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); - memmove(previous + 2 + LINK_SIZE, previous, len); - code += 2 + LINK_SIZE; - *previous++ = OP_BRAZERO + repeat_type; -@@ -2761,19 +3345,41 @@ - /* If the minimum is greater than zero, replicate the group as many - times as necessary, and adjust the maximum to the number of subsequent - copies that we need. If we set a first char from the group, and didn't -- set a required char, copy the latter from the former. */ -+ set a required char, copy the latter from the former. If there are any -+ forward reference subroutine calls in the group, there will be entries on -+ the workspace list; replicate these with an appropriate increment. */ - - else - { - if (repeat_min > 1) - { -- if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; -- for (i = 1; i < repeat_min; i++) -+ /* In the pre-compile phase, we don't actually do the replication. We -+ just adjust the length as if we had. */ -+ -+ if (lengthptr != NULL) -+ *lengthptr += (repeat_min - 1)*length_prevgroup; -+ -+ /* This is compiling for real */ -+ -+ else - { -- memcpy(code, previous, len); -- code += len; -+ if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; -+ for (i = 1; i < repeat_min; i++) -+ { -+ uschar *hc; -+ uschar *this_hwm = cd->hwm; -+ memcpy(code, previous, len); -+ for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) -+ { -+ PUT(cd->hwm, 0, GET(hc, 0) + len); -+ cd->hwm += LINK_SIZE; -+ } -+ save_hwm = this_hwm; -+ code += len; -+ } - } - } -+ - if (repeat_max > 0) repeat_max -= repeat_min; - } - -@@ -2781,12 +3387,27 @@ - the maximum is limited, it replicates the group in a nested fashion, - remembering the bracket starts on a stack. In the case of a zero minimum, - the first one was set up above. In all cases the repeat_max now specifies -- the number of additional copies needed. */ -+ the number of additional copies needed. Again, we must remember to -+ replicate entries on the forward reference list. */ - - if (repeat_max >= 0) - { -- for (i = repeat_max - 1; i >= 0; i--) -+ /* In the pre-compile phase, we don't actually do the replication. We -+ just adjust the length as if we had. For each repetition we must add 1 -+ to the length for BRAZERO and for all but the last repetition we must -+ add 2 + 2*LINKSIZE to allow for the nesting that occurs. */ -+ -+ if (lengthptr != NULL && repeat_max > 0) -+ *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - -+ 2 - 2*LINK_SIZE; /* Last one doesn't nest */ -+ -+ /* This is compiling for real */ -+ -+ else for (i = repeat_max - 1; i >= 0; i--) - { -+ uschar *hc; -+ uschar *this_hwm = cd->hwm; -+ - *code++ = OP_BRAZERO + repeat_type; - - /* All but the final copy start a new nesting, maintaining the -@@ -2802,6 +3423,12 @@ - } - - memcpy(code, previous, len); -+ for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) -+ { -+ PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); -+ cd->hwm += LINK_SIZE; -+ } -+ save_hwm = this_hwm; - code += len; - } - -@@ -2824,9 +3451,34 @@ - /* If the maximum is unlimited, set a repeater in the final copy. We - can't just offset backwards from the current code point, because we - don't know if there's been an options resetting after the ket. The -- correct offset was computed above. */ -+ correct offset was computed above. -+ -+ Then, when we are doing the actual compile phase, check to see whether -+ this group is a non-atomic one that could match an empty string. If so, -+ convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so -+ that runtime checking can be done. [This check is also applied to -+ atomic groups at runtime, but in a different way.] */ - -- else code[-ketoffset] = OP_KETRMAX + repeat_type; -+ else -+ { -+ uschar *ketcode = code - ketoffset; -+ uschar *bracode = ketcode - GET(ketcode, 1); -+ *ketcode = OP_KETRMAX + repeat_type; -+ if (lengthptr == NULL && *bracode != OP_ONCE) -+ { -+ uschar *scode = bracode; -+ do -+ { -+ if (could_be_empty_branch(scode, ketcode, utf8)) -+ { -+ *bracode += OP_SBRA - OP_BRA; -+ break; -+ } -+ scode += GET(scode, 1); -+ } -+ while (*scode == OP_ALT); -+ } -+ } - } - - /* Else there's some kind of shambles */ -@@ -2837,22 +3489,53 @@ - goto FAILED; - } - -- /* If the character following a repeat is '+', we wrap the entire repeated -- item inside OP_ONCE brackets. This is just syntactic sugar, taken from -- Sun's Java package. The repeated item starts at tempcode, not at previous, -- which might be the first part of a string whose (former) last char we -- repeated. However, we don't support '+' after a greediness '?'. */ -+ /* If the character following a repeat is '+', or if certain optimization -+ tests above succeeded, possessive_quantifier is TRUE. For some of the -+ simpler opcodes, there is an special alternative opcode for this. For -+ anything else, we wrap the entire repeated item inside OP_ONCE brackets. -+ The '+' notation is just syntactic sugar, taken from Sun's Java package, -+ but the special opcodes can optimize it a bit. The repeated item starts at -+ tempcode, not at previous, which might be the first part of a string whose -+ (former) last char we repeated. -+ -+ Possessifying an 'exact' quantifier has no effect, so we can ignore it. But -+ an 'upto' may follow. We skip over an 'exact' item, and then test the -+ length of what remains before proceeding. */ - - if (possessive_quantifier) - { -- int len = code - tempcode; -- memmove(tempcode + 1+LINK_SIZE, tempcode, len); -- code += 1 + LINK_SIZE; -- len += 1 + LINK_SIZE; -- tempcode[0] = OP_ONCE; -- *code++ = OP_KET; -- PUTINC(code, 0, len); -- PUT(tempcode, 1, len); -+ int len; -+ if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || -+ *tempcode == OP_NOTEXACT) -+ tempcode += _pcre_OP_lengths[*tempcode]; -+ len = code - tempcode; -+ if (len > 0) switch (*tempcode) -+ { -+ case OP_STAR: *tempcode = OP_POSSTAR; break; -+ case OP_PLUS: *tempcode = OP_POSPLUS; break; -+ case OP_QUERY: *tempcode = OP_POSQUERY; break; -+ case OP_UPTO: *tempcode = OP_POSUPTO; break; -+ -+ case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break; -+ case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break; -+ case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break; -+ case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break; -+ -+ case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break; -+ case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break; -+ case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; -+ case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; -+ -+ default: -+ memmove(tempcode + 1+LINK_SIZE, tempcode, len); -+ code += 1 + LINK_SIZE; -+ len += 1 + LINK_SIZE; -+ tempcode[0] = OP_ONCE; -+ *code++ = OP_KET; -+ PUTINC(code, 0, len); -+ PUT(tempcode, 1, len); -+ break; -+ } - } - - /* In all case we no longer have a previous item. We also set the -@@ -2865,162 +3548,275 @@ - break; - - -- /* Start of nested bracket sub-expression, or comment or lookahead or -- lookbehind or option setting or condition. First deal with special things -- that can come after a bracket; all are introduced by ?, and the appearance -- of any of them means that this is not a referencing group. They were -- checked for validity in the first pass over the string, so we don't have to -- check for syntax errors here. */ -+ /* ===================================================================*/ -+ /* Start of nested parenthesized sub-expression, or comment or lookahead or -+ lookbehind or option setting or condition or all the other extended -+ parenthesis forms. First deal with the specials; all are introduced by ?, -+ and the appearance of any of them means that this is not a capturing -+ group. */ - - case '(': - newoptions = options; - skipbytes = 0; -+ bravalue = OP_CBRA; -+ save_hwm = cd->hwm; - - if (*(++ptr) == '?') - { -- int set, unset; -+ int i, set, unset, namelen; - int *optset; -+ const uschar *name; -+ uschar *slot; - - switch (*(++ptr)) - { - case '#': /* Comment; skip to ket */ - ptr++; -- while (*ptr != ')') ptr++; -+ while (*ptr != 0 && *ptr != ')') ptr++; -+ if (*ptr == 0) -+ { -+ *errorcodeptr = ERR18; -+ goto FAILED; -+ } - continue; - -- case ':': /* Non-extracting bracket */ -+ -+ /* ------------------------------------------------------------ */ -+ case ':': /* Non-capturing bracket */ - bravalue = OP_BRA; - ptr++; - break; - -+ -+ /* ------------------------------------------------------------ */ - case '(': - bravalue = OP_COND; /* Conditional group */ - -- /* A condition can be a number, referring to a numbered group, a name, -- referring to a named group, 'R', referring to recursion, or an -- assertion. There are two unfortunate ambiguities, caused by history. -- (a) 'R' can be the recursive thing or the name 'R', and (b) a number -- could be a name that consists of digits. In both cases, we look for a -- name first; if not found, we try the other cases. If the first -- character after (?( is a word character, we know the rest up to ) will -- also be word characters because the syntax was checked in the first -- pass. */ -- -- if ((cd->ctypes[ptr[1]] & ctype_word) != 0) -- { -- int i, namelen; -- int condref = 0; -- const uschar *name; -- uschar *slot = cd->name_table; -+ /* A condition can be an assertion, a number (referring to a numbered -+ group), a name (referring to a named group), or 'R', referring to -+ recursion. R<digits> and R&name are also permitted for recursion tests. -+ -+ There are several syntaxes for testing a named group: (?(name)) is used -+ by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')). -+ -+ There are two unfortunate ambiguities, caused by history. (a) 'R' can -+ be the recursive thing or the name 'R' (and similarly for 'R' followed -+ by digits), and (b) a number could be a name that consists of digits. -+ In both cases, we look for a name first; if not found, we try the other -+ cases. */ -+ -+ /* For conditions that are assertions, check the syntax, and then exit -+ the switch. This will take control down to where bracketed groups, -+ including assertions, are processed. */ - -- /* This is needed for all successful cases. */ -+ if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<')) -+ break; - -- skipbytes = 3; -+ /* Most other conditions use OP_CREF (a couple change to OP_RREF -+ below), and all need to skip 3 bytes at the start of the group. */ - -- /* Read the name, but also get it as a number if it's all digits */ -+ code[1+LINK_SIZE] = OP_CREF; -+ skipbytes = 3; - -- name = ++ptr; -- while (*ptr != ')') -- { -- if (condref >= 0) -- condref = ((digitab[*ptr] & ctype_digit) != 0)? -- condref * 10 + *ptr - '0' : -1; -- ptr++; -- } -- namelen = ptr - name; -+ /* Check for a test for recursion in a named group. */ -+ -+ if (ptr[1] == 'R' && ptr[2] == '&') -+ { -+ terminator = -1; -+ ptr += 2; -+ code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ -+ } -+ -+ /* Check for a test for a named group's having been set, using the Perl -+ syntax (?(<name>) or (?('name') */ -+ -+ else if (ptr[1] == '<') -+ { -+ terminator = '>'; - ptr++; -+ } -+ else if (ptr[1] == '\'') -+ { -+ terminator = '\''; -+ ptr++; -+ } -+ else terminator = 0; - -- for (i = 0; i < cd->names_found; i++) -- { -- if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; -- slot += cd->name_entry_size; -- } -+ /* We now expect to read a name; any thing else is an error */ - -- /* Found a previous named subpattern */ -+ if ((cd->ctypes[ptr[1]] & ctype_word) == 0) -+ { -+ ptr += 1; /* To get the right offset */ -+ *errorcodeptr = ERR28; -+ goto FAILED; -+ } - -- if (i < cd->names_found) -- { -- condref = GET2(slot, 0); -- code[1+LINK_SIZE] = OP_CREF; -- PUT2(code, 2+LINK_SIZE, condref); -- } -+ /* Read the name, but also get it as a number if it's all digits */ - -- /* Search the pattern for a forward reference */ -+ recno = 0; -+ name = ++ptr; -+ while ((cd->ctypes[*ptr] & ctype_word) != 0) -+ { -+ if (recno >= 0) -+ recno = ((digitab[*ptr] & ctype_digit) != 0)? -+ recno * 10 + *ptr - '0' : -1; -+ ptr++; -+ } -+ namelen = ptr - name; - -- else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0) -- { -- code[1+LINK_SIZE] = OP_CREF; -- PUT2(code, 2+LINK_SIZE, i); -- } -+ if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')') -+ { -+ ptr--; /* Error offset */ -+ *errorcodeptr = ERR26; -+ goto FAILED; -+ } - -- /* Check for 'R' for recursion */ -+ /* Do no further checking in the pre-compile phase. */ - -- else if (namelen == 1 && *name == 'R') -- { -- code[1+LINK_SIZE] = OP_CREF; -- PUT2(code, 2+LINK_SIZE, CREF_RECURSE); -- } -+ if (lengthptr != NULL) break; - -- /* Check for a subpattern number */ -+ /* In the real compile we do the work of looking for the actual -+ reference. */ - -- else if (condref > 0) -- { -- code[1+LINK_SIZE] = OP_CREF; -- PUT2(code, 2+LINK_SIZE, condref); -- } -+ slot = cd->name_table; -+ for (i = 0; i < cd->names_found; i++) -+ { -+ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; -+ slot += cd->name_entry_size; -+ } - -- /* Either an unidentified subpattern, or a reference to (?(0) */ -+ /* Found a previous named subpattern */ - -- else -+ if (i < cd->names_found) -+ { -+ recno = GET2(slot, 0); -+ PUT2(code, 2+LINK_SIZE, recno); -+ } -+ -+ /* Search the pattern for a forward reference */ -+ -+ else if ((i = find_parens(ptr, cd->bracount, name, namelen, -+ (options & PCRE_EXTENDED) != 0)) > 0) -+ { -+ PUT2(code, 2+LINK_SIZE, i); -+ } -+ -+ /* If terminator == 0 it means that the name followed directly after -+ the opening parenthesis [e.g. (?(abc)...] and in this case there are -+ some further alternatives to try. For the cases where terminator != 0 -+ [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have -+ now checked all the possibilities, so give an error. */ -+ -+ else if (terminator != 0) -+ { -+ *errorcodeptr = ERR15; -+ goto FAILED; -+ } -+ -+ /* Check for (?(R) for recursion. Allow digits after R to specify a -+ specific group number. */ -+ -+ else if (*name == 'R') -+ { -+ recno = 0; -+ for (i = 1; i < namelen; i++) - { -- *errorcodeptr = (condref == 0)? ERR35: ERR15; -- goto FAILED; -+ if ((digitab[name[i]] & ctype_digit) == 0) -+ { -+ *errorcodeptr = ERR15; -+ goto FAILED; -+ } -+ recno = recno * 10 + name[i] - '0'; - } -+ if (recno == 0) recno = RREF_ANY; -+ code[1+LINK_SIZE] = OP_RREF; /* Change test type */ -+ PUT2(code, 2+LINK_SIZE, recno); -+ } -+ -+ /* Similarly, check for the (?(DEFINE) "condition", which is always -+ false. */ -+ -+ else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0) -+ { -+ code[1+LINK_SIZE] = OP_DEF; -+ skipbytes = 1; -+ } -+ -+ /* Check for the "name" actually being a subpattern number. */ -+ -+ else if (recno > 0) -+ { -+ PUT2(code, 2+LINK_SIZE, recno); - } - -- /* For conditions that are assertions, we just fall through, having -- set bravalue above. */ -+ /* Either an unidentified subpattern, or a reference to (?(0) */ - -+ else -+ { -+ *errorcodeptr = (recno == 0)? ERR35: ERR15; -+ goto FAILED; -+ } - break; - -+ -+ /* ------------------------------------------------------------ */ - case '=': /* Positive lookahead */ - bravalue = OP_ASSERT; - ptr++; - break; - -+ -+ /* ------------------------------------------------------------ */ - case '!': /* Negative lookahead */ - bravalue = OP_ASSERT_NOT; - ptr++; - break; - -- case '<': /* Lookbehinds */ -- switch (*(++ptr)) -+ -+ /* ------------------------------------------------------------ */ -+ case '<': /* Lookbehind or named define */ -+ switch (ptr[1]) - { - case '=': /* Positive lookbehind */ - bravalue = OP_ASSERTBACK; -- ptr++; -+ ptr += 2; - break; - - case '!': /* Negative lookbehind */ - bravalue = OP_ASSERTBACK_NOT; -- ptr++; -+ ptr += 2; - break; -+ -+ default: /* Could be name define, else bad */ -+ if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME; -+ ptr++; /* Correct offset for error */ -+ *errorcodeptr = ERR24; -+ goto FAILED; - } - break; - -+ -+ /* ------------------------------------------------------------ */ - case '>': /* One-time brackets */ - bravalue = OP_ONCE; - ptr++; - break; - -+ -+ /* ------------------------------------------------------------ */ - case 'C': /* Callout - may be followed by digits; */ - previous_callout = code; /* Save for later completion */ - after_manual_callout = 1; /* Skip one item before completing */ -- *code++ = OP_CALLOUT; /* Already checked that the terminating */ -- { /* closing parenthesis is present. */ -+ *code++ = OP_CALLOUT; -+ { - int n = 0; - while ((digitab[*(++ptr)] & ctype_digit) != 0) - n = n * 10 + *ptr - '0'; -+ if (*ptr != ')') -+ { -+ *errorcodeptr = ERR39; -+ goto FAILED; -+ } - if (n > 255) - { - *errorcodeptr = ERR38; -@@ -3034,134 +3830,232 @@ - previous = NULL; - continue; - -- case 'P': /* Named subpattern handling */ -- if (*(++ptr) == '<') /* Definition */ -+ -+ /* ------------------------------------------------------------ */ -+ case 'P': /* Python-style named subpattern handling */ -+ if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */ -+ { -+ is_recurse = *ptr == '>'; -+ terminator = ')'; -+ goto NAMED_REF_OR_RECURSE; -+ } -+ else if (*ptr != '<') /* Test for Python-style definition */ -+ { -+ *errorcodeptr = ERR41; -+ goto FAILED; -+ } -+ /* Fall through to handle (?P< as (?< is handled */ -+ -+ -+ /* ------------------------------------------------------------ */ -+ DEFINE_NAME: /* Come here from (?< handling */ -+ case '\'': - { -- int i, namelen; -- uschar *slot = cd->name_table; -- const uschar *name; /* Don't amalgamate; some compilers */ -- name = ++ptr; /* grumble at autoincrement in declaration */ -+ terminator = (*ptr == '<')? '>' : '\''; -+ name = ++ptr; -+ -+ while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; -+ namelen = ptr - name; - -- while (*ptr++ != '>'); -- namelen = ptr - name - 1; -+ /* In the pre-compile phase, just do a syntax check. */ - -- for (i = 0; i < cd->names_found; i++) -+ if (lengthptr != NULL) -+ { -+ if (*ptr != terminator) -+ { -+ *errorcodeptr = ERR42; -+ goto FAILED; -+ } -+ if (cd->names_found >= MAX_NAME_COUNT) -+ { -+ *errorcodeptr = ERR49; -+ goto FAILED; -+ } -+ if (namelen + 3 > cd->name_entry_size) -+ { -+ cd->name_entry_size = namelen + 3; -+ if (namelen > MAX_NAME_SIZE) -+ { -+ *errorcodeptr = ERR48; -+ goto FAILED; -+ } -+ } -+ } -+ -+ /* In the real compile, create the entry in the table */ -+ -+ else - { -- int crc = memcmp(name, slot+2, namelen); -- if (crc == 0) -+ slot = cd->name_table; -+ for (i = 0; i < cd->names_found; i++) - { -- if (slot[2+namelen] == 0) -+ int crc = memcmp(name, slot+2, namelen); -+ if (crc == 0) - { -- if ((options & PCRE_DUPNAMES) == 0) -+ if (slot[2+namelen] == 0) - { -- *errorcodeptr = ERR43; -- goto FAILED; -+ if ((options & PCRE_DUPNAMES) == 0) -+ { -+ *errorcodeptr = ERR43; -+ goto FAILED; -+ } - } -+ else crc = -1; /* Current name is substring */ - } -- else crc = -1; /* Current name is substring */ -- } -- if (crc < 0) -- { -- memmove(slot + cd->name_entry_size, slot, -- (cd->names_found - i) * cd->name_entry_size); -- break; -+ if (crc < 0) -+ { -+ memmove(slot + cd->name_entry_size, slot, -+ (cd->names_found - i) * cd->name_entry_size); -+ break; -+ } -+ slot += cd->name_entry_size; - } -- slot += cd->name_entry_size; -- } - -- PUT2(slot, 0, *brackets + 1); -- memcpy(slot + 2, name, namelen); -- slot[2+namelen] = 0; -- cd->names_found++; -- goto NUMBERED_GROUP; -+ PUT2(slot, 0, cd->bracount + 1); -+ memcpy(slot + 2, name, namelen); -+ slot[2+namelen] = 0; -+ } - } - -- if (*ptr == '=' || *ptr == '>') /* Reference or recursion */ -- { -- int i, namelen; -- int type = *ptr++; -- const uschar *name = ptr; -- uschar *slot = cd->name_table; -+ /* In both cases, count the number of names we've encountered. */ - -- while (*ptr != ')') ptr++; -- namelen = ptr - name; -+ ptr++; /* Move past > or ' */ -+ cd->names_found++; -+ goto NUMBERED_GROUP; - -- for (i = 0; i < cd->names_found; i++) -+ -+ /* ------------------------------------------------------------ */ -+ case '&': /* Perl recursion/subroutine syntax */ -+ terminator = ')'; -+ is_recurse = TRUE; -+ /* Fall through */ -+ -+ /* We come here from the Python syntax above that handles both -+ references (?P=name) and recursion (?P>name), as well as falling -+ through from the Perl recursion syntax (?&name). */ -+ -+ NAMED_REF_OR_RECURSE: -+ name = ++ptr; -+ while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; -+ namelen = ptr - name; -+ -+ /* In the pre-compile phase, do a syntax check and set a dummy -+ reference number. */ -+ -+ if (lengthptr != NULL) -+ { -+ if (*ptr != terminator) - { -- if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; -- slot += cd->name_entry_size; -+ *errorcodeptr = ERR42; -+ goto FAILED; - } -- -- if (i < cd->names_found) /* Back reference */ -+ if (namelen > MAX_NAME_SIZE) -+ { -+ *errorcodeptr = ERR48; -+ goto FAILED; -+ } -+ recno = 0; -+ } -+ -+ /* In the real compile, seek the name in the table */ -+ -+ else -+ { -+ slot = cd->name_table; -+ for (i = 0; i < cd->names_found; i++) -+ { -+ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; -+ slot += cd->name_entry_size; -+ } -+ -+ if (i < cd->names_found) /* Back reference */ - { - recno = GET2(slot, 0); - } - else if ((recno = /* Forward back reference */ -- find_named_parens(ptr, *brackets, name, namelen)) <= 0) -+ find_parens(ptr, cd->bracount, name, namelen, -+ (options & PCRE_EXTENDED) != 0)) <= 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } -+ } - -- if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ -- -- /* Back reference */ -+ /* In both phases, we can now go to the code than handles numerical -+ recursion or backreferences. */ - -- previous = code; -- *code++ = OP_REF; -- PUT2INC(code, 0, recno); -- cd->backref_map |= (recno < 32)? (1 << recno) : 1; -- if (recno > cd->top_backref) cd->top_backref = recno; -- continue; -- } -+ if (is_recurse) goto HANDLE_RECURSION; -+ else goto HANDLE_REFERENCE; - -- /* Should never happen */ -- break; - -- case 'R': /* Pattern recursion */ -+ /* ------------------------------------------------------------ */ -+ case 'R': /* Recursion */ - ptr++; /* Same as (?0) */ - /* Fall through */ - -- /* Recursion or "subroutine" call */ - -- case '0': case '1': case '2': case '3': case '4': -- case '5': case '6': case '7': case '8': case '9': -+ /* ------------------------------------------------------------ */ -+ case '0': case '1': case '2': case '3': case '4': /* Recursion or */ -+ case '5': case '6': case '7': case '8': case '9': /* subroutine */ - { - const uschar *called; - recno = 0; - while((digitab[*ptr] & ctype_digit) != 0) - recno = recno * 10 + *ptr++ - '0'; -+ if (*ptr != ')') -+ { -+ *errorcodeptr = ERR29; -+ goto FAILED; -+ } - - /* Come here from code above that handles a named recursion */ - - HANDLE_RECURSION: - - previous = code; -+ called = cd->start_code; - -- /* Find the bracket that is being referenced. Temporarily end the -- regex in case it doesn't exist. */ -+ /* When we are actually compiling, find the bracket that is being -+ referenced. Temporarily end the regex in case it doesn't exist before -+ this point. If we end up with a forward reference, first check that -+ the bracket does occur later so we can give the error (and position) -+ now. Then remember this forward reference in the workspace so it can -+ be filled in at the end. */ - -- *code = OP_END; -- called = (recno == 0)? cd->start_code : -- find_bracket(cd->start_code, utf8, recno); -- if (called == NULL) -+ if (lengthptr == NULL) - { -- *errorcodeptr = ERR15; -- goto FAILED; -- } -+ *code = OP_END; -+ if (recno != 0) called = find_bracket(cd->start_code, utf8, recno); - -- /* If the subpattern is still open, this is a recursive call. We -- check to see if this is a left recursion that could loop for ever, -- and diagnose that case. */ -+ /* Forward reference */ - -- if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8)) -- { -- *errorcodeptr = ERR40; -- goto FAILED; -+ if (called == NULL) -+ { -+ if (find_parens(ptr, cd->bracount, NULL, recno, -+ (options & PCRE_EXTENDED) != 0) < 0) -+ { -+ *errorcodeptr = ERR15; -+ goto FAILED; -+ } -+ called = cd->start_code + recno; -+ PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); -+ } -+ -+ /* If not a forward reference, and the subpattern is still open, -+ this is a recursive call. We check to see if this is a left -+ recursion that could loop for ever, and diagnose that case. */ -+ -+ else if (GET(called, 1) == 0 && -+ could_be_empty(called, code, bcptr, utf8)) -+ { -+ *errorcodeptr = ERR40; -+ goto FAILED; -+ } - } - - /* Insert the recursion/subroutine item, automatically wrapped inside -- "once" brackets. */ -+ "once" brackets. Set up a "previous group" length so that a -+ subsequent quantifier will work. */ - - *code = OP_ONCE; - PUT(code, 1, 2 + 2*LINK_SIZE); -@@ -3174,12 +4068,18 @@ - *code = OP_KET; - PUT(code, 1, 2 + 2*LINK_SIZE); - code += 1 + LINK_SIZE; -+ -+ length_prevgroup = 3 + 3*LINK_SIZE; - } -+ -+ /* Can't determine a first byte now */ -+ -+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - continue; - -- /* Character after (? not specially recognized */ - -- default: /* Option setting */ -+ /* ------------------------------------------------------------ */ -+ default: /* Other characters: check option setting */ - set = unset = 0; - optset = &set; - -@@ -3189,13 +4089,21 @@ - { - case '-': optset = &unset; break; - -+ case 'J': /* Record that it changed in the external options */ -+ *optset |= PCRE_DUPNAMES; -+ cd->external_options |= PCRE_JCHANGED; -+ break; -+ - case 'i': *optset |= PCRE_CASELESS; break; -- case 'J': *optset |= PCRE_DUPNAMES; break; - case 'm': *optset |= PCRE_MULTILINE; break; - case 's': *optset |= PCRE_DOTALL; break; - case 'x': *optset |= PCRE_EXTENDED; break; - case 'U': *optset |= PCRE_UNGREEDY; break; - case 'X': *optset |= PCRE_EXTRA; break; -+ -+ default: *errorcodeptr = ERR12; -+ ptr--; /* Correct the offset */ -+ goto FAILED; - } - } - -@@ -3204,32 +4112,54 @@ - newoptions = (options | set) & (~unset); - - /* If the options ended with ')' this is not the start of a nested -- group with option changes, so the options change at this level. Compile -- code to change the ims options if this setting actually changes any of -- them. We also pass the new setting back so that it can be put at the -- start of any following branches, and when this group ends (if we are in -- a group), a resetting item can be compiled. -- -- Note that if this item is right at the start of the pattern, the -- options will have been abstracted and made global, so there will be no -- change to compile. */ -+ group with option changes, so the options change at this level. If this -+ item is right at the start of the pattern, the options can be -+ abstracted and made external in the pre-compile phase, and ignored in -+ the compile phase. This can be helpful when matching -- for instance in -+ caseless checking of required bytes. -+ -+ If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are -+ definitely *not* at the start of the pattern because something has been -+ compiled. In the pre-compile phase, however, the code pointer can have -+ that value after the start, because it gets reset as code is discarded -+ during the pre-compile. However, this can happen only at top level - if -+ we are within parentheses, the starting BRA will still be present. At -+ any parenthesis level, the length value can be used to test if anything -+ has been compiled at that level. Thus, a test for both these conditions -+ is necessary to ensure we correctly detect the start of the pattern in -+ both phases. -+ -+ If we are not at the pattern start, compile code to change the ims -+ options if this setting actually changes any of them. We also pass the -+ new setting back so that it can be put at the start of any following -+ branches, and when this group ends (if we are in a group), a resetting -+ item can be compiled. */ - - if (*ptr == ')') - { -- if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) -+ if (code == cd->start_code + 1 + LINK_SIZE && -+ (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) - { -- *code++ = OP_OPT; -- *code++ = newoptions & PCRE_IMS; -+ cd->external_options = newoptions; -+ options = newoptions; - } -+ else -+ { -+ if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) -+ { -+ *code++ = OP_OPT; -+ *code++ = newoptions & PCRE_IMS; -+ } - -- /* Change options at this level, and pass them back for use -- in subsequent branches. Reset the greedy defaults and the case -- value for firstbyte and reqbyte. */ -- -- *optionsptr = options = newoptions; -- greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); -- greedy_non_default = greedy_default ^ 1; -- req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; -+ /* Change options at this level, and pass them back for use -+ in subsequent branches. Reset the greedy defaults and the case -+ value for firstbyte and reqbyte. */ -+ -+ *optionsptr = options = newoptions; -+ greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); -+ greedy_non_default = greedy_default ^ 1; -+ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; -+ } - - previous = NULL; /* This item can't be repeated */ - continue; /* It is complete */ -@@ -3242,58 +4172,56 @@ - - bravalue = OP_BRA; - ptr++; -- } -- } -+ } /* End of switch for character following (? */ -+ } /* End of (? handling */ - -- /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become -- non-capturing and behave like (?:...) brackets */ -+ /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, -+ all unadorned brackets become non-capturing and behave like (?:...) -+ brackets. */ - - else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) - { - bravalue = OP_BRA; - } - -- /* Else we have a referencing group; adjust the opcode. If the bracket -- number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and -- arrange for the true number to follow later, in an OP_BRANUMBER item. */ -+ /* Else we have a capturing group. */ - - else - { - NUMBERED_GROUP: -- if (++(*brackets) > EXTRACT_BASIC_MAX) -- { -- bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1; -- code[1+LINK_SIZE] = OP_BRANUMBER; -- PUT2(code, 2+LINK_SIZE, *brackets); -- skipbytes = 3; -- } -- else bravalue = OP_BRA + *brackets; -+ cd->bracount += 1; -+ PUT2(code, 1+LINK_SIZE, cd->bracount); -+ skipbytes = 2; - } - -- /* Process nested bracketed re. Assertions may not be repeated, but other -- kinds can be. We copy code into a non-register variable in order to be able -- to pass its address because some compilers complain otherwise. Pass in a -- new setting for the ims options if they have changed. */ -+ /* Process nested bracketed regex. Assertions may not be repeated, but -+ other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a -+ non-register variable in order to be able to pass its address because some -+ compilers complain otherwise. Pass in a new setting for the ims options if -+ they have changed. */ - - previous = (bravalue >= OP_ONCE)? code : NULL; - *code = bravalue; - tempcode = code; - tempreqvary = cd->req_varyopt; /* Save value before bracket */ -+ length_prevgroup = 0; /* Initialize for pre-compile phase */ - - if (!compile_regex( - newoptions, /* The complete new option state */ - options & PCRE_IMS, /* The previous ims option state */ -- brackets, /* Extracting bracket count */ - &tempcode, /* Where to put code (updated) */ - &ptr, /* Input pointer (updated) */ - errorcodeptr, /* Where to put an error message */ - (bravalue == OP_ASSERTBACK || - bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ -- skipbytes, /* Skip over OP_COND/OP_BRANUMBER */ -+ skipbytes, /* Skip over bracket number */ - &subfirstbyte, /* For possible first char */ - &subreqbyte, /* For possible last char */ - bcptr, /* Current branch chain */ -- cd)) /* Tables block */ -+ cd, /* Tables block */ -+ (lengthptr == NULL)? NULL : /* Actual compile phase */ -+ &length_prevgroup /* Pre-compile phase */ -+ )) - goto FAILED; - - /* At the end of compiling, code is still pointing to the start of the -@@ -3302,9 +4230,9 @@ - is on the bracket. */ - - /* If this is a conditional bracket, check that there are no more than -- two branches in the group. */ -+ two branches in the group, or just one if it's a DEFINE group. */ - -- else if (bravalue == OP_COND) -+ if (bravalue == OP_COND) - { - uschar *tc = code; - int condcount = 0; -@@ -3315,29 +4243,77 @@ - } - while (*tc != OP_KET); - -- if (condcount > 2) -+ /* A DEFINE group is never obeyed inline (the "condition" is always -+ false). It must have only one branch. */ -+ -+ if (code[LINK_SIZE+1] == OP_DEF) - { -- *errorcodeptr = ERR27; -- goto FAILED; -+ if (condcount > 1) -+ { -+ *errorcodeptr = ERR54; -+ goto FAILED; -+ } -+ bravalue = OP_DEF; /* Just a flag to suppress char handling below */ -+ } -+ -+ /* A "normal" conditional group. If there is just one branch, we must not -+ make use of its firstbyte or reqbyte, because this is equivalent to an -+ empty second branch. */ -+ -+ else -+ { -+ if (condcount > 2) -+ { -+ *errorcodeptr = ERR27; -+ goto FAILED; -+ } -+ if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; - } -+ } -+ -+ /* Error if hit end of pattern */ - -- /* If there is just one branch, we must not make use of its firstbyte or -- reqbyte, because this is equivalent to an empty second branch. */ -+ if (*ptr != ')') -+ { -+ *errorcodeptr = ERR14; -+ goto FAILED; -+ } - -- if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; -+ /* In the pre-compile phase, update the length by the length of the nested -+ group, less the brackets at either end. Then reduce the compiled code to -+ just the brackets so that it doesn't use much memory if it is duplicated by -+ a quantifier. */ -+ -+ if (lengthptr != NULL) -+ { -+ *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; -+ code++; -+ PUTINC(code, 0, 1 + LINK_SIZE); -+ *code++ = OP_KET; -+ PUTINC(code, 0, 1 + LINK_SIZE); - } - -- /* Handle updating of the required and first characters. Update for normal -- brackets of all kinds, and conditions with two branches (see code above). -- If the bracket is followed by a quantifier with zero repeat, we have to -- back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the -- main loop so that they can be accessed for the back off. */ -+ /* Otherwise update the main code pointer to the end of the group. */ -+ -+ else code = tempcode; -+ -+ /* For a DEFINE group, required and first character settings are not -+ relevant. */ -+ -+ if (bravalue == OP_DEF) break; -+ -+ /* Handle updating of the required and first characters for other types of -+ group. Update for normal brackets of all kinds, and conditions with two -+ branches (see code above). If the bracket is followed by a quantifier with -+ zero repeat, we have to back off. Hence the definition of zeroreqbyte and -+ zerofirstbyte outside the main loop so that they can be accessed for the -+ back off. */ - - zeroreqbyte = reqbyte; - zerofirstbyte = firstbyte; - groupsetfirstbyte = FALSE; - -- if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND) -+ if (bravalue >= OP_ONCE) - { - /* If we have not yet set a firstbyte in this branch, take it from the - subpattern, remembering that it was set here so that a repeat of more -@@ -3378,35 +4354,22 @@ - firstbyte, looking for an asserted first char. */ - - else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; -+ break; /* End of processing '(' */ - -- /* Now update the main code pointer to the end of the group. */ -- -- code = tempcode; -- -- /* Error if hit end of pattern */ -- -- if (*ptr != ')') -- { -- *errorcodeptr = ERR14; -- goto FAILED; -- } -- break; -- -- /* Check \ for being a real metacharacter; if not, fall through and handle -- it as a data character at the start of a string. Escape items are checked -- for validity in the pre-compiling pass. */ -- -- case '\\': -- tempptr = ptr; -- c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE); - -- /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values -+ /* ===================================================================*/ -+ /* Handle metasequences introduced by \. For ones like \d, the ESC_ values - are arranged to be the negation of the corresponding OP_values. For the - back references, the values are ESC_REF plus the reference number. Only - back references and those types that consume a character may be repeated. - We can test for values between ESC_b and ESC_Z for the latter; this may - have to change if any new ones are ever created. */ - -+ case '\\': -+ tempptr = ptr; -+ c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); -+ if (*errorcodeptr != 0) goto FAILED; -+ - if (c < 0) - { - if (-c == ESC_Q) /* Handle start of quoted string */ -@@ -3416,6 +4379,8 @@ - continue; - } - -+ if (-c == ESC_E) continue; /* Perl ignores an orphan \E */ -+ - /* For metasequences that actually match a character, we disable the - setting of a first character if it hasn't already been set. */ - -@@ -3427,18 +4392,33 @@ - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; - -- /* Back references are handled specially */ -+ /* \k<name> or \k'name' is a back reference by name (Perl syntax) */ -+ -+ if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'')) -+ { -+ is_recurse = FALSE; -+ terminator = (*(++ptr) == '<')? '>' : '\''; -+ goto NAMED_REF_OR_RECURSE; -+ } -+ -+ /* Back references are handled specially; must disable firstbyte if -+ not set to cope with cases like (?=(\w+))\1: which would otherwise set -+ ':' later. */ - - if (-c >= ESC_REF) - { -- int number = -c - ESC_REF; -+ recno = -c - ESC_REF; -+ -+ HANDLE_REFERENCE: /* Come here from named backref handling */ -+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - previous = code; - *code++ = OP_REF; -- PUT2INC(code, 0, number); -+ PUT2INC(code, 0, recno); -+ cd->backref_map |= (recno < 32)? (1 << recno) : 1; -+ if (recno > cd->top_backref) cd->top_backref = recno; - } - -- /* So are Unicode property matches, if supported. We know that get_ucp -- won't fail because it was tested in the pre-pass. */ -+ /* So are Unicode property matches, if supported. */ - - #ifdef SUPPORT_UCP - else if (-c == ESC_P || -c == ESC_p) -@@ -3446,15 +4426,26 @@ - BOOL negated; - int pdata; - int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); -+ if (ptype < 0) goto FAILED; - previous = code; - *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; - *code++ = ptype; - *code++ = pdata; - } -+#else -+ -+ /* If Unicode properties are not supported, \X, \P, and \p are not -+ allowed. */ -+ -+ else if (-c == ESC_X || -c == ESC_P || -c == ESC_p) -+ { -+ *errorcodeptr = ERR45; -+ goto FAILED; -+ } - #endif - -- /* For the rest, we can obtain the OP value by negating the escape -- value */ -+ /* For the rest (including \X when Unicode properties are supported), we -+ can obtain the OP value by negating the escape value. */ - - else - { -@@ -3478,9 +4469,10 @@ - mcbuffer[0] = c; - mclength = 1; - } -- - goto ONE_CHAR; - -+ -+ /* ===================================================================*/ - /* Handle a literal character. It is guaranteed not to be whitespace or # - when the extended flag is set. If we are in UTF-8 mode, it may be a - multi-byte literal character. */ -@@ -3491,7 +4483,7 @@ - mcbuffer[0] = c; - - #ifdef SUPPORT_UTF8 -- if (utf8 && (c & 0xc0) == 0xc0) -+ if (utf8 && c >= 0xc0) - { - while ((ptr[1] & 0xc0) == 0x80) - mcbuffer[mclength++] = *(++ptr); -@@ -3542,6 +4534,7 @@ - } - } /* end of big loop */ - -+ - /* Control never reaches here by falling through, only by a goto for all the - error states. Pass back the position in the pattern so that it can be displayed - to the user for diagnosing the error. */ -@@ -3558,35 +4551,40 @@ - * Compile sequence of alternatives * - *************************************************/ - --/* On entry, ptr is pointing past the bracket character, but on return --it points to the closing bracket, or vertical bar, or end of string. --The code variable is pointing at the byte into which the BRA operator has been --stored. If the ims options are changed at the start (for a (?ims: group) or --during any branch, we need to insert an OP_OPT item at the start of every --following branch to ensure they get set correctly at run time, and also pass --the new options into every subsequent branch compile. -+/* On entry, ptr is pointing past the bracket character, but on return it -+points to the closing bracket, or vertical bar, or end of string. The code -+variable is pointing at the byte into which the BRA operator has been stored. -+If the ims options are changed at the start (for a (?ims: group) or during any -+branch, we need to insert an OP_OPT item at the start of every following branch -+to ensure they get set correctly at run time, and also pass the new options -+into every subsequent branch compile. -+ -+This function is used during the pre-compile phase when we are trying to find -+out the amount of memory needed, as well as during the real compile phase. The -+value of lengthptr distinguishes the two phases. - - Argument: - options option bits, including any changes for this subpattern - oldims previous settings of ims option bits -- brackets -> int containing the number of extracting brackets used - codeptr -> the address of the current code pointer - ptrptr -> the address of the current pattern pointer - errorcodeptr -> pointer to error code variable - lookbehind TRUE if this is a lookbehind assertion -- skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER) -+ skipbytes skip this many bytes at start (for brackets and OP_COND) - firstbyteptr place to put the first required character, or a negative number - reqbyteptr place to put the last required character, or a negative number - bcptr pointer to the chain of currently open branches - cd points to the data block with tables pointers etc. -+ lengthptr NULL during the real compile phase -+ points to length accumulator during pre-compile phase - --Returns: TRUE on success -+Returns: TRUE on success - */ - - static BOOL --compile_regex(int options, int oldims, int *brackets, uschar **codeptr, -- const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes, -- int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd) -+compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr, -+ int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr, -+ int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr) - { - const uschar *ptr = *ptrptr; - uschar *code = *codeptr; -@@ -3595,6 +4593,7 @@ - uschar *reverse_count = NULL; - int firstbyte, reqbyte; - int branchfirstbyte, branchreqbyte; -+int length; - branch_chain bc; - - bc.outer = bcptr; -@@ -3602,6 +4601,20 @@ - - firstbyte = reqbyte = REQ_UNSET; - -+/* Accumulate the length for use in the pre-compile phase. Start with the -+length of the BRA and KET and any extra bytes that are required at the -+beginning. We accumulate in a local variable to save frequent testing of -+lenthptr for NULL. We cannot do this by looking at the value of code at the -+start and end of each alternative, because compiled items are discarded during -+the pre-compile phase so that the work space is not exceeded. */ -+ -+length = 2 + 2*LINK_SIZE + skipbytes; -+ -+/* WARNING: If the above line is changed for any reason, you must also change -+the code that abstracts option settings at the start of the pattern and makes -+them global. It tests the value of length for (2 + 2*LINK_SIZE) in the -+pre-compile phase to find out whether anything has yet been compiled or not. */ -+ - /* Offset is set zero to mark that this bracket is still open */ - - PUT(code, 1, 0); -@@ -3617,6 +4630,7 @@ - { - *code++ = OP_OPT; - *code++ = options & PCRE_IMS; -+ length += 2; - } - - /* Set up dummy OP_REVERSE if lookbehind assertion */ -@@ -3626,73 +4640,80 @@ - *code++ = OP_REVERSE; - reverse_count = code; - PUTINC(code, 0, 0); -+ length += 1 + LINK_SIZE; - } - -- /* Now compile the branch */ -+ /* Now compile the branch; in the pre-compile phase its length gets added -+ into the length. */ - -- if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr, -- &branchfirstbyte, &branchreqbyte, &bc, cd)) -+ if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, -+ &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length)) - { - *ptrptr = ptr; - return FALSE; - } - -- /* If this is the first branch, the firstbyte and reqbyte values for the -- branch become the values for the regex. */ -+ /* In the real compile phase, there is some post-processing to be done. */ - -- if (*last_branch != OP_ALT) -+ if (lengthptr == NULL) - { -- firstbyte = branchfirstbyte; -- reqbyte = branchreqbyte; -- } -+ /* If this is the first branch, the firstbyte and reqbyte values for the -+ branch become the values for the regex. */ - -- /* If this is not the first branch, the first char and reqbyte have to -- match the values from all the previous branches, except that if the previous -- value for reqbyte didn't have REQ_VARY set, it can still match, and we set -- REQ_VARY for the regex. */ -+ if (*last_branch != OP_ALT) -+ { -+ firstbyte = branchfirstbyte; -+ reqbyte = branchreqbyte; -+ } - -- else -- { -- /* If we previously had a firstbyte, but it doesn't match the new branch, -- we have to abandon the firstbyte for the regex, but if there was previously -- no reqbyte, it takes on the value of the old firstbyte. */ -+ /* If this is not the first branch, the first char and reqbyte have to -+ match the values from all the previous branches, except that if the -+ previous value for reqbyte didn't have REQ_VARY set, it can still match, -+ and we set REQ_VARY for the regex. */ - -- if (firstbyte >= 0 && firstbyte != branchfirstbyte) -+ else - { -- if (reqbyte < 0) reqbyte = firstbyte; -- firstbyte = REQ_NONE; -- } -+ /* If we previously had a firstbyte, but it doesn't match the new branch, -+ we have to abandon the firstbyte for the regex, but if there was -+ previously no reqbyte, it takes on the value of the old firstbyte. */ -+ -+ if (firstbyte >= 0 && firstbyte != branchfirstbyte) -+ { -+ if (reqbyte < 0) reqbyte = firstbyte; -+ firstbyte = REQ_NONE; -+ } - -- /* If we (now or from before) have no firstbyte, a firstbyte from the -- branch becomes a reqbyte if there isn't a branch reqbyte. */ -+ /* If we (now or from before) have no firstbyte, a firstbyte from the -+ branch becomes a reqbyte if there isn't a branch reqbyte. */ - -- if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) -- branchreqbyte = branchfirstbyte; -+ if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) -+ branchreqbyte = branchfirstbyte; - -- /* Now ensure that the reqbytes match */ -+ /* Now ensure that the reqbytes match */ - -- if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) -- reqbyte = REQ_NONE; -- else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ -- } -+ if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) -+ reqbyte = REQ_NONE; -+ else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ -+ } - -- /* If lookbehind, check that this branch matches a fixed-length string, -- and put the length into the OP_REVERSE item. Temporarily mark the end of -- the branch with OP_END. */ -+ /* If lookbehind, check that this branch matches a fixed-length string, and -+ put the length into the OP_REVERSE item. Temporarily mark the end of the -+ branch with OP_END. */ - -- if (lookbehind) -- { -- int length; -- *code = OP_END; -- length = find_fixedlength(last_branch, options); -- DPRINTF(("fixed length = %d\n", length)); -- if (length < 0) -+ if (lookbehind) - { -- *errorcodeptr = (length == -2)? ERR36 : ERR25; -- *ptrptr = ptr; -- return FALSE; -+ int fixed_length; -+ *code = OP_END; -+ fixed_length = find_fixedlength(last_branch, options); -+ DPRINTF(("fixed length = %d\n", fixed_length)); -+ if (fixed_length < 0) -+ { -+ *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; -+ *ptrptr = ptr; -+ return FALSE; -+ } -+ PUT(reverse_count, 0, fixed_length); - } -- PUT(reverse_count, 0, length); - } - - /* Reached end of expression, either ')' or end of pattern. Go back through -@@ -3706,15 +4727,15 @@ - - if (*ptr != '|') - { -- int length = code - last_branch; -+ int branch_length = code - last_branch; - do - { - int prev_length = GET(last_branch, 1); -- PUT(last_branch, 1, length); -- length = prev_length; -- last_branch -= length; -+ PUT(last_branch, 1, branch_length); -+ branch_length = prev_length; -+ last_branch -= branch_length; - } -- while (length > 0); -+ while (branch_length > 0); - - /* Fill in the ket */ - -@@ -3728,6 +4749,7 @@ - { - *code++ = OP_OPT; - *code++ = oldims; -+ length += 2; - } - - /* Set values to pass back */ -@@ -3736,6 +4758,7 @@ - *ptrptr = ptr; - *firstbyteptr = firstbyte; - *reqbyteptr = reqbyte; -+ if (lengthptr != NULL) *lengthptr += length; - return TRUE; - } - -@@ -3749,6 +4772,7 @@ - bc.current = last_branch = code; - code += 1 + LINK_SIZE; - ptr++; -+ length += 1 + LINK_SIZE; - } - /* Control never reaches here */ - } -@@ -3799,24 +4823,29 @@ - unsigned int backref_map) - { - do { -- const uschar *scode = -- first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE); -+ const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], -+ options, PCRE_MULTILINE, FALSE); - register int op = *scode; - -+ /* Non-capturing brackets */ -+ -+ if (op == OP_BRA) -+ { -+ if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; -+ } -+ - /* Capturing brackets */ - -- if (op > OP_BRA) -+ else if (op == OP_CBRA) - { -- int new_map; -- op -= OP_BRA; -- if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE); -- new_map = bracket_map | ((op < 32)? (1 << op) : 1); -+ int n = GET2(scode, 1+LINK_SIZE); -+ int new_map = bracket_map | ((n < 32)? (1 << n) : 1); - if (!is_anchored(scode, options, new_map, backref_map)) return FALSE; - } - - /* Other brackets */ - -- else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) -+ else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) - { - if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; - } -@@ -3824,7 +4853,8 @@ - /* .* is not anchored unless DOTALL is set and it isn't in brackets that - are or may be referenced. */ - -- else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) && -+ else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || -+ op == OP_TYPEPOSSTAR) && - (*options & PCRE_DOTALL) != 0) - { - if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; -@@ -3869,30 +4899,35 @@ - unsigned int backref_map) - { - do { -- const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0, -- FALSE); -+ const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], -+ NULL, 0, FALSE); - register int op = *scode; - -+ /* Non-capturing brackets */ -+ -+ if (op == OP_BRA) -+ { -+ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; -+ } -+ - /* Capturing brackets */ - -- if (op > OP_BRA) -+ else if (op == OP_CBRA) - { -- int new_map; -- op -= OP_BRA; -- if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE); -- new_map = bracket_map | ((op < 32)? (1 << op) : 1); -+ int n = GET2(scode, 1+LINK_SIZE); -+ int new_map = bracket_map | ((n < 32)? (1 << n) : 1); - if (!is_startline(scode, new_map, backref_map)) return FALSE; - } - - /* Other brackets */ - -- else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) -+ else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) - { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } - - /* .* means "start at start or after \n" if it isn't in brackets that - may be referenced. */ - -- else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) -+ else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) - { - if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; - } -@@ -3941,14 +4976,13 @@ - first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE); - register int op = *scode; - -- if (op >= OP_BRA) op = OP_BRA; -- - switch(op) - { - default: - return -1; - - case OP_BRA: -+ case OP_CBRA: - case OP_ASSERT: - case OP_ONCE: - case OP_COND: -@@ -3964,6 +4998,7 @@ - case OP_CHARNC: - case OP_PLUS: - case OP_MINPLUS: -+ case OP_POSPLUS: - if (!inassert) return -1; - if (c < 0) - { -@@ -4012,37 +5047,36 @@ - } - - -- - PCRE_DATA_SCOPE pcre * - pcre_compile2(const char *pattern, int options, int *errorcodeptr, - const char **errorptr, int *erroroffset, const unsigned char *tables) - { - real_pcre *re; --int length = 1 + LINK_SIZE; /* For initial BRA plus length */ --int c, firstbyte, reqbyte, newline; --int bracount = 0; --int branch_extra = 0; --int branch_newextra; --int item_count = -1; --int name_count = 0; --int max_name_size = 0; --int lastitemlength = 0; -+int length = 1; /* For final END opcode */ -+int firstbyte, reqbyte, newline; - int errorcode = 0; - #ifdef SUPPORT_UTF8 - BOOL utf8; --BOOL class_utf8; - #endif --BOOL inescq = FALSE; --BOOL capturing; --unsigned int brastackptr = 0; - size_t size; - uschar *code; - const uschar *codestart; - const uschar *ptr; - compile_data compile_block; - compile_data *cd = &compile_block; --int brastack[BRASTACK_SIZE]; --uschar bralenstack[BRASTACK_SIZE]; -+ -+/* This space is used for "compiling" into during the first phase, when we are -+computing the amount of memory that is needed. Compiled items are thrown away -+as soon as possible, so that a fairly large buffer should be sufficient for -+this purpose. The same space is used in the second phase for remembering where -+to fill in forward references to subpatterns. */ -+ -+uschar cworkspace[COMPILE_WORK_SIZE]; -+ -+ -+/* Set this early so that early errors get offset 0. */ -+ -+ptr = (const uschar *)pattern; - - /* We can't pass back an error message if errorptr is NULL; I guess the best we - can do is just return NULL, but we can set a code value if there is a code -@@ -4075,7 +5109,7 @@ - (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) - { - errorcode = ERR44; -- goto PCRE_EARLY_ERROR_RETURN; -+ goto PCRE_UTF8_ERROR_RETURN; - } - #else - if ((options & PCRE_UTF8) != 0) -@@ -4099,34 +5133,43 @@ - cd->cbits = tables + cbits_offset; - cd->ctypes = tables + ctypes_offset; - --/* Handle different types of newline. The two bits give four cases. The current --code allows for one- or two-byte sequences. */ -+/* Handle different types of newline. The three bits give seven cases. The -+current code allows for fixed one- or two-byte sequences, plus "any". */ - --switch (options & PCRE_NEWLINE_CRLF) -+switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY)) - { -- default: newline = NEWLINE; break; /* Compile-time default */ -+ case 0: newline = NEWLINE; break; /* Compile-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; - case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; -+ case PCRE_NEWLINE_ANY: newline = -1; break; -+ default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; - } - --if (newline > 255) -+if (newline < 0) - { -- cd->nllen = 2; -- cd->nl[0] = (newline >> 8) & 255; -- cd->nl[1] = newline & 255; -+ cd->nltype = NLTYPE_ANY; - } - else - { -- cd->nllen = 1; -- cd->nl[0] = newline; -+ cd->nltype = NLTYPE_FIXED; -+ if (newline > 255) -+ { -+ cd->nllen = 2; -+ cd->nl[0] = (newline >> 8) & 255; -+ cd->nl[1] = newline & 255; -+ } -+ else -+ { -+ cd->nllen = 1; -+ cd->nl[0] = newline; -+ } - } - --/* Maximum back reference and backref bitmap. This is updated for numeric --references during the first pass, but for named references during the actual --compile pass. The bitmap records up to 31 back references to help in deciding --whether (.*) can be treated as anchored or not. */ -+/* Maximum back reference and backref bitmap. The bitmap records up to 31 back -+references to help in deciding whether (.*) can be treated as anchored or not. -+*/ - - cd->top_backref = 0; - cd->backref_map = 0; -@@ -4136,1041 +5179,151 @@ - DPRINTF(("------------------------------------------------------------------\n")); - DPRINTF(("%s\n", pattern)); - --/* The first thing to do is to make a pass over the pattern to compute the --amount of store required to hold the compiled code. This does not have to be --perfect as long as errors are overestimates. At the same time we can detect any --flag settings right at the start, and extract them. Make an attempt to correct --for any counted white space if an "extended" flag setting appears late in the --pattern. We can't be so clever for #-comments. */ -- --ptr = (const uschar *)(pattern - 1); --while ((c = *(++ptr)) != 0) -- { -- int min, max; -- int class_optcount; -- int bracket_length; -- int duplength; -+/* Pretend to compile the pattern while actually just accumulating the length -+of memory required. This behaviour is triggered by passing a non-NULL final -+argument to compile_regex(). We pass a block of workspace (cworkspace) for it -+to compile parts of the pattern into; the compiled code is discarded when it is -+no longer needed, so hopefully this workspace will never overflow, though there -+is a test for its doing so. */ - -- /* If we are inside a \Q...\E sequence, all chars are literal */ -+cd->bracount = 0; -+cd->names_found = 0; -+cd->name_entry_size = 0; -+cd->name_table = NULL; -+cd->start_workspace = cworkspace; -+cd->start_code = cworkspace; -+cd->hwm = cworkspace; -+cd->start_pattern = (const uschar *)pattern; -+cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); -+cd->req_varyopt = 0; -+cd->nopartial = FALSE; -+cd->external_options = options; - -- if (inescq) -- { -- if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE; -- goto NORMAL_CHAR; -- } -+/* Now do the pre-compile. On error, errorcode will be set non-zero, so we -+don't need to look at the result of the function here. The initial options have -+been put into the cd block so that they can be changed if an option setting is -+found within the regex right at the beginning. Bringing initial option settings -+outside can help speed up starting point checks. */ - -- /* Otherwise, first check for ignored whitespace and comments */ -+code = cworkspace; -+*code = OP_BRA; -+(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, -+ &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length); -+if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; - -- if ((options & PCRE_EXTENDED) != 0) -- { -- if ((cd->ctypes[c] & ctype_space) != 0) continue; -- if (c == '#') -- { -- while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; -- if (*ptr != 0) -- { -- ptr += cd->nllen - 1; -- continue; -- } -- break; /* End loop at end of pattern */ -- } -- } -+DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, -+ cd->hwm - cworkspace)); - -- item_count++; /* Is zero for the first non-comment item */ -+if (length > MAX_PATTERN_SIZE) -+ { -+ errorcode = ERR20; -+ goto PCRE_EARLY_ERROR_RETURN; -+ } - -- /* Allow space for auto callout before every item except quantifiers. */ -+/* Compute the size of data block needed and get it, either from malloc or -+externally provided function. Integer overflow should no longer be possible -+because nowadays we limit the maximum value of cd->names_found and -+cd->name_entry_size. */ - -- if ((options & PCRE_AUTO_CALLOUT) != 0 && -- c != '*' && c != '+' && c != '?' && -- (c != '{' || !is_counted_repeat(ptr + 1))) -- length += 2 + 2*LINK_SIZE; -+size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3); -+re = (real_pcre *)(pcre_malloc)(size); - -- switch(c) -- { -- /* A backslashed item may be an escaped data character or it may be a -- character type. */ -+if (re == NULL) -+ { -+ errorcode = ERR21; -+ goto PCRE_EARLY_ERROR_RETURN; -+ } - -- case '\\': -- c = check_escape(&ptr, &errorcode, bracount, options, FALSE); -- if (errorcode != 0) goto PCRE_ERROR_RETURN; -+/* Put in the magic number, and save the sizes, initial options, and character -+table pointer. NULL is used for the default character tables. The nullpad field -+is at the end; it's there to help in the case when a regex compiled on a system -+with 4-byte pointers is run on another with 8-byte pointers. */ - -- lastitemlength = 1; /* Default length of last item for repeats */ -+re->magic_number = MAGIC_NUMBER; -+re->size = size; -+re->options = cd->external_options; -+re->dummy1 = 0; -+re->first_byte = 0; -+re->req_byte = 0; -+re->name_table_offset = sizeof(real_pcre); -+re->name_entry_size = cd->name_entry_size; -+re->name_count = cd->names_found; -+re->ref_count = 0; -+re->tables = (tables == _pcre_default_tables)? NULL : tables; -+re->nullpad = NULL; - -- if (c >= 0) /* Data character */ -- { -- length += 2; /* For a one-byte character */ -+/* The starting points of the name/number translation table and of the code are -+passed around in the compile data block. The start/end pattern and initial -+options are already set from the pre-compile phase, as is the name_entry_size -+field. Reset the bracket count and the names_found field. Also reset the hwm -+field; this time it's used for remembering forward references to subpatterns. -+*/ - --#ifdef SUPPORT_UTF8 -- if (utf8 && c > 127) -- { -- int i; -- for (i = 0; i < _pcre_utf8_table1_size; i++) -- if (c <= _pcre_utf8_table1[i]) break; -- length += i; -- lastitemlength += i; -- } --#endif -+cd->bracount = 0; -+cd->names_found = 0; -+cd->name_table = (uschar *)re + re->name_table_offset; -+codestart = cd->name_table + re->name_entry_size * re->name_count; -+cd->start_code = codestart; -+cd->hwm = cworkspace; -+cd->req_varyopt = 0; -+cd->nopartial = FALSE; - -- continue; -- } -+/* Set up a starting, non-extracting bracket, then compile the expression. On -+error, errorcode will be set non-zero, so we don't need to look at the result -+of the function here. */ - -- /* If \Q, enter "literal" mode */ -+ptr = (const uschar *)pattern; -+code = (uschar *)codestart; -+*code = OP_BRA; -+(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, -+ &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); -+re->top_bracket = cd->bracount; -+re->top_backref = cd->top_backref; - -- if (-c == ESC_Q) -- { -- inescq = TRUE; -- continue; -- } -+if (cd->nopartial) re->options |= PCRE_NOPARTIAL; - -- /* \X is supported only if Unicode property support is compiled */ -+/* If not reached end of pattern on success, there's an excess bracket. */ - --#ifndef SUPPORT_UCP -- if (-c == ESC_X) -- { -- errorcode = ERR45; -- goto PCRE_ERROR_RETURN; -- } --#endif -+if (errorcode == 0 && *ptr != 0) errorcode = ERR22; - -- /* \P and \p are for Unicode properties, but only when the support has -- been compiled. Each item needs 3 bytes. */ -+/* Fill in the terminating state and check for disastrous overflow, but -+if debugging, leave the test till after things are printed out. */ - -- else if (-c == ESC_P || -c == ESC_p) -- { --#ifdef SUPPORT_UCP -- BOOL negated; -- BOOL pdata; -- length += 3; -- lastitemlength = 3; -- if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0) -- goto PCRE_ERROR_RETURN; -- continue; --#else -- errorcode = ERR45; -- goto PCRE_ERROR_RETURN; -+*code++ = OP_END; -+ -+#ifndef DEBUG -+if (code - codestart > length) errorcode = ERR23; - #endif -- } - -- /* Other escapes need one byte */ -+/* Fill in any forward references that are required. */ - -- length++; -+while (errorcode == 0 && cd->hwm > cworkspace) -+ { -+ int offset, recno; -+ const uschar *groupptr; -+ cd->hwm -= LINK_SIZE; -+ offset = GET(cd->hwm, 0); -+ recno = GET(codestart, offset); -+ groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno); -+ if (groupptr == NULL) errorcode = ERR53; -+ else PUT(((uschar *)codestart), offset, groupptr - codestart); -+ } - -- /* A back reference needs an additional 2 bytes, plus either one or 5 -- bytes for a repeat. We also need to keep the value of the highest -- back reference. */ -+/* Give an error if there's back reference to a non-existent capturing -+subpattern. */ - -- if (c <= -ESC_REF) -- { -- int refnum = -c - ESC_REF; -- cd->backref_map |= (refnum < 32)? (1 << refnum) : 1; -- if (refnum > cd->top_backref) -- cd->top_backref = refnum; -- length += 2; /* For single back reference */ -- if (ptr[1] == '{' && is_counted_repeat(ptr+2)) -- { -- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); -- if (errorcode != 0) goto PCRE_ERROR_RETURN; -- if ((min == 0 && (max == 1 || max == -1)) || -- (min == 1 && max == -1)) -- length++; -- else length += 5; -- if (ptr[1] == '?') ptr++; -- } -- } -- continue; -- -- case '^': /* Single-byte metacharacters */ -- case '.': -- case '$': -- length++; -- lastitemlength = 1; -- continue; -- -- case '*': /* These repeats won't be after brackets; */ -- case '+': /* those are handled separately */ -- case '?': -- length++; -- goto POSESSIVE; /* A few lines below */ -- -- /* This covers the cases of braced repeats after a single char, metachar, -- class, or back reference. */ -- -- case '{': -- if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR; -- ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode); -- if (errorcode != 0) goto PCRE_ERROR_RETURN; -- -- /* These special cases just insert one extra opcode */ -- -- if ((min == 0 && (max == 1 || max == -1)) || -- (min == 1 && max == -1)) -- length++; -- -- /* These cases might insert additional copies of a preceding character. */ -- -- else -- { -- if (min != 1) -- { -- length -= lastitemlength; /* Uncount the original char or metachar */ -- if (min > 0) length += 3 + lastitemlength; -- } -- length += lastitemlength + ((max > 0)? 3 : 1); -- } -- -- if (ptr[1] == '?') ptr++; /* Needs no extra length */ -- -- POSESSIVE: /* Test for possessive quantifier */ -- if (ptr[1] == '+') -- { -- ptr++; -- length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */ -- } -- continue; -- -- /* An alternation contains an offset to the next branch or ket. If any ims -- options changed in the previous branch(es), and/or if we are in a -- lookbehind assertion, extra space will be needed at the start of the -- branch. This is handled by branch_extra. */ -- -- case '|': -- length += 1 + LINK_SIZE + branch_extra; -- continue; -- -- /* A character class uses 33 characters provided that all the character -- values are less than 256. Otherwise, it uses a bit map for low valued -- characters, and individual items for others. Don't worry about character -- types that aren't allowed in classes - they'll get picked up during the -- compile. A character class that contains only one single-byte character -- uses 2 or 3 bytes, depending on whether it is negated or not. Notice this -- where we can. (In UTF-8 mode we can do this only for chars < 128.) */ -- -- case '[': -- if (*(++ptr) == '^') -- { -- class_optcount = 10; /* Greater than one */ -- ptr++; -- } -- else class_optcount = 0; -- --#ifdef SUPPORT_UTF8 -- class_utf8 = FALSE; --#endif -- -- /* Written as a "do" so that an initial ']' is taken as data */ -- -- if (*ptr != 0) do -- { -- /* Inside \Q...\E everything is literal except \E */ -- -- if (inescq) -- { -- if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER; -- inescq = FALSE; -- ptr += 1; -- continue; -- } -- -- /* Outside \Q...\E, check for escapes */ -- -- if (*ptr == '\\') -- { -- c = check_escape(&ptr, &errorcode, bracount, options, TRUE); -- if (errorcode != 0) goto PCRE_ERROR_RETURN; -- -- /* \b is backspace inside a class; \X is literal */ -- -- if (-c == ESC_b) c = '\b'; -- else if (-c == ESC_X) c = 'X'; -- -- /* \Q enters quoting mode */ -- -- else if (-c == ESC_Q) -- { -- inescq = TRUE; -- continue; -- } -- -- /* Handle escapes that turn into characters */ -- -- if (c >= 0) goto NON_SPECIAL_CHARACTER; -- -- /* Escapes that are meta-things. The normal ones just affect the -- bit map, but Unicode properties require an XCLASS extended item. */ -- -- else -- { -- class_optcount = 10; /* \d, \s etc; make sure > 1 */ --#ifdef SUPPORT_UTF8 -- if (-c == ESC_p || -c == ESC_P) -- { -- if (!class_utf8) -- { -- class_utf8 = TRUE; -- length += LINK_SIZE + 2; -- } -- length += 3; -- } --#endif -- } -- } -- -- /* Check the syntax for POSIX stuff. The bits we actually handle are -- checked during the real compile phase. */ -- -- else if (*ptr == '[' && -- (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && -- check_posix_syntax(ptr, &ptr, cd)) -- { -- ptr++; -- class_optcount = 10; /* Make sure > 1 */ -- } -- -- /* Anything else increments the possible optimization count. We have to -- detect ranges here so that we can compute the number of extra ranges for -- caseless wide characters when UCP support is available. If there are wide -- characters, we are going to have to use an XCLASS, even for single -- characters. */ -- -- else -- { -- int d; -- -- GET_ONE_CHARACTER: -- --#ifdef SUPPORT_UTF8 -- if (utf8) -- { -- int extra = 0; -- GETCHARLEN(c, ptr, extra); -- ptr += extra; -- } -- else c = *ptr; --#else -- c = *ptr; --#endif -- -- /* Come here from handling \ above when it escapes to a char value */ -- -- NON_SPECIAL_CHARACTER: -- class_optcount++; -- -- d = -1; -- if (ptr[1] == '-') -- { -- uschar const *hyptr = ptr++; -- if (ptr[1] == '\\') -- { -- ptr++; -- d = check_escape(&ptr, &errorcode, bracount, options, TRUE); -- if (errorcode != 0) goto PCRE_ERROR_RETURN; -- if (-d == ESC_b) d = '\b'; /* backspace */ -- else if (-d == ESC_X) d = 'X'; /* literal X in a class */ -- } -- else if (ptr[1] != 0 && ptr[1] != ']') -- { -- ptr++; --#ifdef SUPPORT_UTF8 -- if (utf8) -- { -- int extra = 0; -- GETCHARLEN(d, ptr, extra); -- ptr += extra; -- } -- else --#endif -- d = *ptr; -- } -- if (d < 0) ptr = hyptr; /* go back to hyphen as data */ -- } -- -- /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or > -- 127 for caseless matching, we will need to use an XCLASS. */ -- -- if (d >= 0) -- { -- class_optcount = 10; /* Ensure > 1 */ -- if (d < c) -- { -- errorcode = ERR8; -- goto PCRE_ERROR_RETURN; -- } -- --#ifdef SUPPORT_UTF8 -- if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) -- { -- uschar buffer[6]; -- if (!class_utf8) /* Allow for XCLASS overhead */ -- { -- class_utf8 = TRUE; -- length += LINK_SIZE + 2; -- } -- --#ifdef SUPPORT_UCP -- /* If we have UCP support, find out how many extra ranges are -- needed to map the other case of characters within this range. We -- have to mimic the range optimization here, because extending the -- range upwards might push d over a boundary that makes is use -- another byte in the UTF-8 representation. */ -- -- if ((options & PCRE_CASELESS) != 0) -- { -- int occ, ocd; -- int cc = c; -- int origd = d; -- while (get_othercase_range(&cc, origd, &occ, &ocd)) -- { -- if (occ >= c && ocd <= d) continue; /* Skip embedded */ -- -- if (occ < c && ocd >= c - 1) /* Extend the basic range */ -- { /* if there is overlap, */ -- c = occ; /* noting that if occ < c */ -- continue; /* we can't have ocd > d */ -- } /* because a subrange is */ -- if (ocd > d && occ <= d + 1) /* always shorter than */ -- { /* the basic range. */ -- d = ocd; -- continue; -- } -- -- /* An extra item is needed */ -- -- length += 1 + _pcre_ord2utf8(occ, buffer) + -- ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer)); -- } -- } --#endif /* SUPPORT_UCP */ -- -- /* The length of the (possibly extended) range */ -- -- length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer); -- } --#endif /* SUPPORT_UTF8 */ -- -- } -- -- /* We have a single character. There is nothing to be done unless we -- are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must -- allow for an XCL_SINGLE item, doubled for caselessness if there is UCP -- support. */ -- -- else -- { --#ifdef SUPPORT_UTF8 -- if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) -- { -- uschar buffer[6]; -- class_optcount = 10; /* Ensure > 1 */ -- if (!class_utf8) /* Allow for XCLASS overhead */ -- { -- class_utf8 = TRUE; -- length += LINK_SIZE + 2; -- } --#ifdef SUPPORT_UCP -- length += (((options & PCRE_CASELESS) != 0)? 2 : 1) * -- (1 + _pcre_ord2utf8(c, buffer)); --#else /* SUPPORT_UCP */ -- length += 1 + _pcre_ord2utf8(c, buffer); --#endif /* SUPPORT_UCP */ -- } --#endif /* SUPPORT_UTF8 */ -- } -- } -- } -- while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */ -- -- if (*ptr == 0) /* Missing terminating ']' */ -- { -- errorcode = ERR6; -- goto PCRE_ERROR_RETURN; -- } -- -- /* We can optimize when there was only one optimizable character. Repeats -- for positive and negated single one-byte chars are handled by the general -- code. Here, we handle repeats for the class opcodes. */ -- -- if (class_optcount == 1) length += 3; else -- { -- length += 33; -- -- /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier, -- we also need extra for wrapping the whole thing in a sub-pattern. */ -- -- if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2)) -- { -- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); -- if (errorcode != 0) goto PCRE_ERROR_RETURN; -- if ((min == 0 && (max == 1 || max == -1)) || -- (min == 1 && max == -1)) -- length++; -- else length += 5; -- if (ptr[1] == '+') -- { -- ptr++; -- length += 2 + 2*LINK_SIZE; -- } -- else if (ptr[1] == '?') ptr++; -- } -- } -- continue; -- -- /* Brackets may be genuine groups or special things */ -- -- case '(': -- branch_newextra = 0; -- bracket_length = 1 + LINK_SIZE; -- capturing = FALSE; -- -- /* Handle special forms of bracket, which all start (? */ -- -- if (ptr[1] == '?') -- { -- int set, unset; -- int *optset; -- -- switch (c = ptr[2]) -- { -- /* Skip over comments entirely */ -- case '#': -- ptr += 3; -- while (*ptr != 0 && *ptr != ')') ptr++; -- if (*ptr == 0) -- { -- errorcode = ERR18; -- goto PCRE_ERROR_RETURN; -- } -- continue; -- -- /* Non-referencing groups and lookaheads just move the pointer on, and -- then behave like a non-special bracket, except that they don't increment -- the count of extracting brackets. Ditto for the "once only" bracket, -- which is in Perl from version 5.005. */ -- -- case ':': -- case '=': -- case '!': -- case '>': -- ptr += 2; -- break; -- -- /* Named subpatterns are an extension copied from Python */ -- -- case 'P': -- ptr += 3; -- -- /* Handle the definition of a named subpattern */ -- -- if (*ptr == '<') -- { -- const uschar *p; /* Don't amalgamate; some compilers */ -- p = ++ptr; /* grumble at autoincrement in declaration */ -- while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; -- if (*ptr != '>') -- { -- errorcode = ERR42; -- goto PCRE_ERROR_RETURN; -- } -- name_count++; -- if (name_count > MAX_NAME_COUNT) -- { -- errorcode = ERR49; -- goto PCRE_ERROR_RETURN; -- } -- if (ptr - p > max_name_size) -- { -- max_name_size = (ptr - p); -- if (max_name_size > MAX_NAME_SIZE) -- { -- errorcode = ERR48; -- goto PCRE_ERROR_RETURN; -- } -- } -- capturing = TRUE; /* Named parentheses are always capturing */ -- break; /* Go handle capturing parentheses */ -- } -- -- /* Handle back references and recursive calls to named subpatterns */ -- -- if (*ptr == '=' || *ptr == '>') -- { -- length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */ -- while ((cd->ctypes[*(++ptr)] & ctype_word) != 0); -- if (*ptr != ')') -- { -- errorcode = ERR42; -- goto PCRE_ERROR_RETURN; -- } -- goto RECURSE_CHECK_QUANTIFIED; -- } -- -- /* Unknown character after (?P */ -- -- errorcode = ERR41; -- goto PCRE_ERROR_RETURN; -- -- /* (?R) specifies a recursive call to the regex, which is an extension -- to provide the facility which can be obtained by (?p{perl-code}) in -- Perl 5.6. In Perl 5.8 this has become (??{perl-code}). -- -- From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to -- the appropriate numbered brackets. This includes both recursive and -- non-recursive calls. (?R) is now synonymous with (?0). */ -- -- case 'R': -- ptr++; -- -- case '0': case '1': case '2': case '3': case '4': -- case '5': case '6': case '7': case '8': case '9': -- ptr += 2; -- if (c != 'R') -- while ((digitab[*(++ptr)] & ctype_digit) != 0); -- if (*ptr != ')') -- { -- errorcode = ERR29; -- goto PCRE_ERROR_RETURN; -- } -- length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */ -- -- /* If this item is quantified, it will get wrapped inside brackets so -- as to use the code for quantified brackets. We jump down and use the -- code that handles this for real brackets. Come here from code for -- named recursions/subroutines. */ -- -- RECURSE_CHECK_QUANTIFIED: -- if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') -- { -- length += 2 + 2 * LINK_SIZE; /* to make bracketed */ -- duplength = 5 + 3 * LINK_SIZE; -- goto HANDLE_QUANTIFIED_BRACKETS; -- } -- continue; -- -- /* (?C) is an extension which provides "callout" - to provide a bit of -- the functionality of the Perl (?{...}) feature. An optional number may -- follow (default is zero). */ -- -- case 'C': -- ptr += 2; -- while ((digitab[*(++ptr)] & ctype_digit) != 0); -- if (*ptr != ')') -- { -- errorcode = ERR39; -- goto PCRE_ERROR_RETURN; -- } -- length += 2 + 2*LINK_SIZE; -- continue; -- -- /* Lookbehinds are in Perl from version 5.005 */ -- -- case '<': -- ptr += 3; -- if (*ptr == '=' || *ptr == '!') -- { -- branch_newextra = 1 + LINK_SIZE; -- length += 1 + LINK_SIZE; /* For the first branch */ -- break; -- } -- errorcode = ERR24; -- goto PCRE_ERROR_RETURN; -- -- /* Conditionals are in Perl from version 5.005. The bracket must either -- be followed by a number (for bracket reference) or by an assertion -- group. PCRE extends this by allowing a name to reference a named group; -- unfortunately, previously 'R' was implemented for a recursion test. -- When this is compiled, we look for the named group 'R' first. At this -- point we just do a basic syntax check. */ -- -- case '(': -- if ((cd->ctypes[ptr[3]] & ctype_word) != 0) -- { -- ptr += 4; -- length += 3; -- while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; -- if (*ptr != ')') -- { -- errorcode = ERR26; -- goto PCRE_ERROR_RETURN; -- } -- } -- else /* An assertion must follow */ -- { -- ptr++; /* Can treat like ':' as far as spacing is concerned */ -- if (ptr[2] != '?' || -- (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') ) -- { -- ptr += 2; /* To get right offset in message */ -- errorcode = ERR28; -- goto PCRE_ERROR_RETURN; -- } -- } -- break; -- -- /* Else loop checking valid options until ) is met. Anything else is an -- error. If we are without any brackets, i.e. at top level, the settings -- act as if specified in the options, so massage the options immediately. -- This is for backward compatibility with Perl 5.004. */ -- -- default: -- set = unset = 0; -- optset = &set; -- ptr += 2; -- -- for (;; ptr++) -- { -- c = *ptr; -- switch (c) -- { -- case 'i': -- *optset |= PCRE_CASELESS; -- continue; -- -- case 'J': -- *optset |= PCRE_DUPNAMES; -- options |= PCRE_JCHANGED; /* Record that it changed */ -- continue; -- -- case 'm': -- *optset |= PCRE_MULTILINE; -- continue; -- -- case 's': -- *optset |= PCRE_DOTALL; -- continue; -- -- case 'x': -- *optset |= PCRE_EXTENDED; -- continue; -- -- case 'X': -- *optset |= PCRE_EXTRA; -- continue; -- -- case 'U': -- *optset |= PCRE_UNGREEDY; -- continue; -- -- case '-': -- optset = &unset; -- continue; -- -- /* A termination by ')' indicates an options-setting-only item; if -- this is at the very start of the pattern (indicated by item_count -- being zero), we use it to set the global options. This is helpful -- when analyzing the pattern for first characters, etc. Otherwise -- nothing is done here and it is handled during the compiling -- process. -- -- We allow for more than one options setting at the start. If such -- settings do not change the existing options, nothing is compiled. -- However, we must leave space just in case something is compiled. -- This can happen for pathological sequences such as (?i)(?-i) -- because the global options will end up with -i set. The space is -- small and not significant. (Before I did this there was a reported -- bug with (?i)(?-i) in a machine-generated pattern.) -- -- [Historical note: Up to Perl 5.8, options settings at top level -- were always global settings, wherever they appeared in the pattern. -- That is, they were equivalent to an external setting. From 5.8 -- onwards, they apply only to what follows (which is what you might -- expect).] */ -- -- case ')': -- if (item_count == 0) -- { -- options = (options | set) & (~unset); -- set = unset = 0; /* To save length */ -- item_count--; /* To allow for several */ -- length += 2; -- } -- -- /* Fall through */ -- -- /* A termination by ':' indicates the start of a nested group with -- the given options set. This is again handled at compile time, but -- we must allow for compiled space if any of the ims options are -- set. We also have to allow for resetting space at the end of -- the group, which is why 4 is added to the length and not just 2. -- If there are several changes of options within the same group, this -- will lead to an over-estimate on the length, but this shouldn't -- matter very much. We also have to allow for resetting options at -- the start of any alternations, which we do by setting -- branch_newextra to 2. */ -- -- case ':': -- if (((set|unset) & PCRE_IMS) != 0) -- { -- length += 4; -- branch_newextra = 2; -- } -- goto END_OPTIONS; -- -- /* Unrecognized option character */ -- -- default: -- errorcode = ERR12; -- goto PCRE_ERROR_RETURN; -- } -- } -- -- /* If we hit a closing bracket, that's it - this is a freestanding -- option-setting. We need to ensure that branch_extra is updated if -- necessary. The only values branch_newextra can have here are 0 or 2. -- If the value is 2, then branch_extra must either be 2 or 5, depending -- on whether this is a lookbehind group or not. */ -- -- END_OPTIONS: -- if (c == ')') -- { -- if (branch_newextra == 2 && -- (branch_extra == 0 || branch_extra == 1+LINK_SIZE)) -- branch_extra += branch_newextra; -- continue; -- } -- -- /* If options were terminated by ':' control comes here. This is a -- non-capturing group with an options change. There is nothing more that -- needs to be done because "capturing" is already set FALSE by default; -- we can just fall through. */ -- -- } -- } -- -- /* Ordinary parentheses, not followed by '?', are capturing unless -- PCRE_NO_AUTO_CAPTURE is set. */ -- -- else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0; -- -- /* Capturing brackets must be counted so we can process escapes in a -- Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need -- an additional 3 bytes of memory per capturing bracket. */ -- -- if (capturing) -- { -- bracount++; -- if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3; -- } -- -- /* Save length for computing whole length at end if there's a repeat that -- requires duplication of the group. Also save the current value of -- branch_extra, and start the new group with the new value. If non-zero, this -- will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */ -- -- if (brastackptr >= sizeof(brastack)/sizeof(int)) -- { -- errorcode = ERR19; -- goto PCRE_ERROR_RETURN; -- } -- -- bralenstack[brastackptr] = branch_extra; -- branch_extra = branch_newextra; -- -- brastack[brastackptr++] = length; -- length += bracket_length; -- continue; -- -- /* Handle ket. Look for subsequent max/min; for certain sets of values we -- have to replicate this bracket up to that many times. If brastackptr is -- 0 this is an unmatched bracket which will generate an error, but take care -- not to try to access brastack[-1] when computing the length and restoring -- the branch_extra value. */ -- -- case ')': -- length += 1 + LINK_SIZE; -- if (brastackptr > 0) -- { -- duplength = length - brastack[--brastackptr]; -- branch_extra = bralenstack[brastackptr]; -- /* This is a paranoid check to stop integer overflow later on */ -- if (duplength > MAX_DUPLENGTH) -- { -- errorcode = ERR50; -- goto PCRE_ERROR_RETURN; -- } -- } -- else duplength = 0; -- -- /* The following code is also used when a recursion such as (?3) is -- followed by a quantifier, because in that case, it has to be wrapped inside -- brackets so that the quantifier works. The value of duplength must be -- set before arrival. */ -- -- HANDLE_QUANTIFIED_BRACKETS: -- -- /* Leave ptr at the final char; for read_repeat_counts this happens -- automatically; for the others we need an increment. */ -- -- if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2)) -- { -- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); -- if (errorcode != 0) goto PCRE_ERROR_RETURN; -- } -- else if (c == '*') { min = 0; max = -1; ptr++; } -- else if (c == '+') { min = 1; max = -1; ptr++; } -- else if (c == '?') { min = 0; max = 1; ptr++; } -- else { min = 1; max = 1; } -- -- /* If the minimum is zero, we have to allow for an OP_BRAZERO before the -- group, and if the maximum is greater than zero, we have to replicate -- maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting -- bracket set. */ -- -- if (min == 0) -- { -- length++; -- if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE); -- } -- -- /* When the minimum is greater than zero, we have to replicate up to -- minval-1 times, with no additions required in the copies. Then, if there -- is a limited maximum we have to replicate up to maxval-1 times allowing -- for a BRAZERO item before each optional copy and nesting brackets for all -- but one of the optional copies. */ -- -- else -- { -- length += (min - 1) * duplength; -- if (max > min) /* Need this test as max=-1 means no limit */ -- length += (max - min) * (duplength + 3 + 2*LINK_SIZE) -- - (2 + 2*LINK_SIZE); -- } -- -- /* Allow space for once brackets for "possessive quantifier" */ -- -- if (ptr[1] == '+') -- { -- ptr++; -- length += 2 + 2*LINK_SIZE; -- } -- continue; -- -- /* Non-special character. It won't be space or # in extended mode, so it is -- always a genuine character. If we are in a \Q...\E sequence, check for the -- end; if not, we have a literal. */ -- -- default: -- NORMAL_CHAR: -- -- if (inescq && c == '\\' && ptr[1] == 'E') -- { -- inescq = FALSE; -- ptr++; -- continue; -- } -- -- length += 2; /* For a one-byte character */ -- lastitemlength = 1; /* Default length of last item for repeats */ -- -- /* In UTF-8 mode, check for additional bytes. */ -- --#ifdef SUPPORT_UTF8 -- if (utf8 && (c & 0xc0) == 0xc0) -- { -- while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */ -- { /* because the end is marked */ -- lastitemlength++; /* by a zero byte. */ -- length++; -- ptr++; -- } -- } --#endif -- -- continue; -- } -- } -- --length += 2 + LINK_SIZE; /* For final KET and END */ -- --if ((options & PCRE_AUTO_CALLOUT) != 0) -- length += 2 + 2*LINK_SIZE; /* For final callout */ -- --if (length > MAX_PATTERN_SIZE) -- { -- errorcode = ERR20; -- goto PCRE_EARLY_ERROR_RETURN; -- } -- --/* Compute the size of data block needed and get it, either from malloc or --externally provided function. Integer overflow should no longer be possible --because nowadays we limit the maximum value of name_count and max_name size. */ -- --size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); --re = (real_pcre *)(pcre_malloc)(size); -- --if (re == NULL) -- { -- errorcode = ERR21; -- goto PCRE_EARLY_ERROR_RETURN; -- } -- --/* Put in the magic number, and save the sizes, options, and character table --pointer. NULL is used for the default character tables. The nullpad field is at --the end; it's there to help in the case when a regex compiled on a system with --4-byte pointers is run on another with 8-byte pointers. */ -- --re->magic_number = MAGIC_NUMBER; --re->size = size; --re->options = options; --re->dummy1 = 0; --re->name_table_offset = sizeof(real_pcre); --re->name_entry_size = max_name_size + 3; --re->name_count = name_count; --re->ref_count = 0; --re->tables = (tables == _pcre_default_tables)? NULL : tables; --re->nullpad = NULL; -- --/* The starting points of the name/number translation table and of the code are --passed around in the compile data block. */ -- --cd->names_found = 0; --cd->name_entry_size = max_name_size + 3; --cd->name_table = (uschar *)re + re->name_table_offset; --codestart = cd->name_table + re->name_entry_size * re->name_count; --cd->start_code = codestart; --cd->start_pattern = (const uschar *)pattern; --cd->req_varyopt = 0; --cd->nopartial = FALSE; -- --/* Set up a starting, non-extracting bracket, then compile the expression. On --error, errorcode will be set non-zero, so we don't need to look at the result --of the function here. */ -- --ptr = (const uschar *)pattern; --code = (uschar *)codestart; --*code = OP_BRA; --bracount = 0; --(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, -- &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd); --re->top_bracket = bracount; --re->top_backref = cd->top_backref; -- --if (cd->nopartial) re->options |= PCRE_NOPARTIAL; -- --/* If not reached end of pattern on success, there's an excess bracket. */ -- --if (errorcode == 0 && *ptr != 0) errorcode = ERR22; -- --/* Fill in the terminating state and check for disastrous overflow, but --if debugging, leave the test till after things are printed out. */ -- --*code++ = OP_END; -- --#ifndef DEBUG --if (code - codestart > length) errorcode = ERR23; --#endif -- --/* Give an error if there's back reference to a non-existent capturing --subpattern. */ -- --if (re->top_backref > re->top_bracket) errorcode = ERR15; -+if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; - - /* Failed to compile, or error while post-processing */ - - if (errorcode != 0) - { - (pcre_free)(re); -- PCRE_ERROR_RETURN: -- *erroroffset = ptr - (const uschar *)pattern; - PCRE_EARLY_ERROR_RETURN: -+ *erroroffset = ptr - (const uschar *)pattern; -+#ifdef SUPPORT_UTF8 -+ PCRE_UTF8_ERROR_RETURN: -+#endif - *errorptr = error_texts[errorcode]; - if (errorcodeptr != NULL) *errorcodeptr = errorcode; - return NULL; -@@ -5180,15 +5333,15 @@ - the pattern is anchored by virtue of ^ characters or \A or anything else (such - as starting with .* when DOTALL is set). - --Otherwise, if we know what the first character has to be, save it, because that -+Otherwise, if we know what the first byte has to be, save it, because that - speeds up unanchored matches no end. If not, see if we can set the - PCRE_STARTLINE flag. This is helpful for multiline matches when all branches - start with ^. and also when all branches start with .* for non-DOTALL matches. - */ - --if ((options & PCRE_ANCHORED) == 0) -+if ((re->options & PCRE_ANCHORED) == 0) - { -- int temp_options = options; -+ int temp_options = re->options; /* May get changed during these scans */ - if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) - re->options |= PCRE_ANCHORED; - else -@@ -5273,7 +5426,7 @@ - if (errorcodeptr != NULL) *errorcodeptr = ERR23; - return NULL; - } --#endif -+#endif /* DEBUG */ - - return (pcre *)re; - } -diff -ruN ../pcre.orig/pcrelib/pcre_exec.c ./pcrelib/pcre_exec.c ---- ../pcre.orig/pcrelib/pcre_exec.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_exec.c Fri Feb 9 22:31:19 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -42,25 +42,22 @@ - pattern matching using an NFA algorithm, trying to mimic Perl as closely as - possible. There are also some static supporting functions. */ - --#define NLBLOCK md /* The block containing newline information */ -+#define NLBLOCK md /* Block containing newline information */ -+#define PSSTART start_subject /* Field containing processed string start */ -+#define PSEND end_subject /* Field containing processed string end */ -+ - #include "pcre_internal.h" - -+/* The chain of eptrblocks for tail recursions uses memory in stack workspace, -+obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */ - --/* Structure for building a chain of data that actually lives on the --stack, for holding the values of the subject pointer at the start of each --subpattern, so as to detect when an empty string has been matched by a --subpattern - to break infinite loops. When NO_RECURSE is set, these blocks --are on the heap, not on the stack. */ -- --typedef struct eptrblock { -- struct eptrblock *epb_prev; -- USPTR epb_saved_eptr; --} eptrblock; -+#define EPTR_WORK_SIZE (1000) - - /* Flag bits for the match() function */ - --#define match_condassert 0x01 /* Called to check a condition assertion */ --#define match_isgroup 0x02 /* Set if start of bracketed group */ -+#define match_condassert 0x01 /* Called to check a condition assertion */ -+#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ -+#define match_tail_recursed 0x04 /* Tail recursive call */ - - /* Non-error returns from the match() function. Error returns are externally - defined PCRE_ERROR_xxx codes, which are all negative. */ -@@ -101,7 +98,7 @@ - static void - pchars(const uschar *p, int length, BOOL is_subject, match_data *md) - { --int c; -+unsigned int c; - if (is_subject && length > md->end_subject - p) length = md->end_subject - p; - while (length-- > 0) - if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); -@@ -291,7 +288,6 @@ - - BOOL Xcur_is_word; - BOOL Xcondition; -- BOOL Xminimize; - BOOL Xprev_is_word; - - unsigned long int Xoriginal_ims; -@@ -303,11 +299,10 @@ - int Xprop_category; - int Xprop_chartype; - int Xprop_script; -- int *Xprop_test_variable; - #endif - - int Xctype; -- int Xfc; -+ unsigned int Xfc; - int Xfi; - int Xlength; - int Xmax; -@@ -340,10 +335,7 @@ - * Match from current position * - *************************************************/ - --/* On entry ecode points to the first opcode, and eptr to the first character --in the subject string, while eptrb holds the value of eptr at the start of the --last bracketed group - used for breaking infinite loops matching zero-length --strings. This function is called recursively in many circumstances. Whenever it -+/* This function is called recursively in many circumstances. Whenever it - returns a negative (error) response, the outer incarnation must also return the - same response. - -@@ -353,8 +345,8 @@ - made performance worse. - - Arguments: -- eptr pointer in subject -- ecode position in code -+ eptr pointer to current character in subject -+ ecode pointer to current position in compiled code - offset_top current top pointer - md pointer to "static" info for the match - ims current /i, /m, and /s options -@@ -362,7 +354,9 @@ - brackets - for testing for empty matches - flags can contain - match_condassert - this is an assertion condition -- match_isgroup - this is the start of a bracketed group -+ match_cbegroup - this is the start of an unlimited repeat -+ group that can match an empty string -+ match_tail_recursed - this is a tail_recursed group - rdepth the recursion depth - - Returns: MATCH_MATCH if matched ) these values are >= 0 -@@ -377,14 +371,16 @@ - int flags, unsigned int rdepth) - { - /* These variables do not need to be preserved over recursion in this function, --so they can be ordinary variables in all cases. Mark them with "register" --because they are used a lot in loops. */ -+so they can be ordinary variables in all cases. Mark some of them with -+"register" because they are used a lot in loops. */ - - register int rrc; /* Returns from recursive calls */ - register int i; /* Used for loops not involving calls to RMATCH() */ --register unsigned int c; /* Character values not kept over RMATCH() calls */ -+register unsigned int c; /* Character values not kept over RMATCH() calls */ - register BOOL utf8; /* Local copy of UTF-8 flag for speed */ - -+BOOL minimize, possessive; /* Quantifier options */ -+ - /* When recursion is not being used, all "local" variables that have to be - preserved over calls to RMATCH() are part of a "frame" which is obtained from - heap storage. Set up the top-level frame here; others are obtained from the -@@ -434,7 +430,6 @@ - - #define cur_is_word frame->Xcur_is_word - #define condition frame->Xcondition --#define minimize frame->Xminimize - #define prev_is_word frame->Xprev_is_word - - #define original_ims frame->Xoriginal_ims -@@ -446,7 +441,6 @@ - #define prop_category frame->Xprop_category - #define prop_chartype frame->Xprop_chartype - #define prop_script frame->Xprop_script --#define prop_test_variable frame->Xprop_test_variable - #endif - - #define ctype frame->Xctype -@@ -470,7 +464,7 @@ - get preserved during recursion in the normal way. In this environment, fi and - i, and fc and c, can be the same variables. */ - --#else -+#else /* NO_RECURSE not defined */ - #define fi i - #define fc c - -@@ -489,7 +483,6 @@ - /* that do not have to be preserved over */ - BOOL cur_is_word; /* a recursive call to RMATCH(). */ - BOOL condition; --BOOL minimize; - BOOL prev_is_word; - - unsigned long int original_ims; -@@ -501,7 +494,6 @@ - int prop_category; - int prop_chartype; - int prop_script; --int *prop_test_variable; - #endif - - int ctype; -@@ -516,7 +508,7 @@ - int stacksave[REC_STACK_SAVE_MAX]; - - eptrblock newptrb; --#endif -+#endif /* NO_RECURSE */ - - /* These statements are here to stop the compiler complaining about unitialized - variables. */ -@@ -524,9 +516,9 @@ - #ifdef SUPPORT_UCP - prop_value = 0; - prop_fail_result = 0; --prop_test_variable = NULL; - #endif - -+ - /* This label is used for tail recursion, which is used in a few cases even - when NO_RECURSE is not defined, in order to reduce the amount of stack that is - used. Thanks to Ian Taylor for noticing this possibility and sending the -@@ -556,24 +548,34 @@ - utf8 = FALSE; - #endif - --/* At the start of a bracketed group, add the current subject pointer to the --stack of such pointers, to be re-instated at the end of the group when we hit --the closing ket. When match() is called in other circumstances, we don't add to --this stack. */ -+/* At the start of a group with an unlimited repeat that may match an empty -+string, the match_cbegroup flag is set. When this is the case, add the current -+subject pointer to the chain of such remembered pointers, to be checked when we -+hit the closing ket, in order to break infinite loops that match no characters. -+When match() is called in other circumstances, don't add to the chain. If this -+is a tail recursion, use a block from the workspace, as the one on the stack is -+already used. */ - --if ((flags & match_isgroup) != 0) -+if ((flags & match_cbegroup) != 0) - { -- newptrb.epb_prev = eptrb; -- newptrb.epb_saved_eptr = eptr; -- eptrb = &newptrb; -+ eptrblock *p; -+ if ((flags & match_tail_recursed) != 0) -+ { -+ if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT); -+ p = md->eptrchain + md->eptrn++; -+ } -+ else p = &newptrb; -+ p->epb_saved_eptr = eptr; -+ p->epb_prev = eptrb; -+ eptrb = p; - } - --/* Now start processing the operations. */ -+/* Now start processing the opcodes. */ - - for (;;) - { -+ minimize = possessive = FALSE; - op = *ecode; -- minimize = FALSE; - - /* For partial matching, remember if we ever hit the end of the subject after - matching at least one subject character. */ -@@ -583,33 +585,30 @@ - eptr > md->start_match) - md->hitend = TRUE; - -- /* Opening capturing bracket. If there is space in the offset vector, save -- the current subject position in the working slot at the top of the vector. We -- mustn't change the current values of the data slot, because they may be set -- from a previous iteration of this group, and be referred to by a reference -- inside the group. -- -- If the bracket fails to match, we need to restore this value and also the -- values of the final offsets, in case they were set by a previous iteration of -- the same bracket. -- -- If there isn't enough space in the offset vector, treat this as if it were a -- non-capturing bracket. Don't worry about setting the flag for the error case -- here; that is handled in the code for KET. */ -- -- if (op > OP_BRA) -+ switch(op) - { -- number = op - OP_BRA; -- -- /* For extended extraction brackets (large number), we have to fish out the -- number from a dummy opcode at the start. */ -- -- if (number > EXTRACT_BASIC_MAX) -- number = GET2(ecode, 2+LINK_SIZE); -+ /* Handle a capturing bracket. If there is space in the offset vector, save -+ the current subject position in the working slot at the top of the vector. -+ We mustn't change the current values of the data slot, because they may be -+ set from a previous iteration of this group, and be referred to by a -+ reference inside the group. -+ -+ If the bracket fails to match, we need to restore this value and also the -+ values of the final offsets, in case they were set by a previous iteration -+ of the same bracket. -+ -+ If there isn't enough space in the offset vector, treat this as if it were -+ a non-capturing bracket. Don't worry about setting the flag for the error -+ case here; that is handled in the code for KET. */ -+ -+ case OP_CBRA: -+ case OP_SCBRA: -+ number = GET2(ecode, 1+LINK_SIZE); - offset = number << 1; - - #ifdef DEBUG -- printf("start bracket %d subject=", number); -+ printf("start bracket %d\n", number); -+ printf("subject="); - pchars(eptr, 16, TRUE, md); - printf("\n"); - #endif -@@ -624,10 +623,11 @@ - DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); - md->offset_vector[md->offset_end - number] = eptr - md->start_subject; - -+ flags = (op == OP_SCBRA)? match_cbegroup : 0; - do - { -- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, -- match_isgroup); -+ RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, -+ ims, eptrb, flags); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - md->capture_last = save_capture_last; - ecode += GET(ecode, 1); -@@ -643,39 +643,35 @@ - RRETURN(MATCH_NOMATCH); - } - -- /* Insufficient room for saving captured contents */ -- -- else op = OP_BRA; -- } -- -- /* Other types of node can be handled by a switch */ -+ /* Insufficient room for saving captured contents. Treat as a non-capturing -+ bracket. */ - -- switch(op) -- { -- case OP_BRA: /* Non-capturing bracket: optimized */ -- DPRINTF(("start bracket 0\n")); -- -- /* Loop for all the alternatives */ -+ DPRINTF(("insufficient capture room: treat as non-capturing\n")); - -+ /* Non-capturing bracket. Loop for all the alternatives. When we get to the -+ final alternative within the brackets, we would return the result of a -+ recursive call to match() whatever happened. We can reduce stack usage by -+ turning this into a tail recursion. */ -+ -+ case OP_BRA: -+ case OP_SBRA: -+ DPRINTF(("start non-capturing bracket\n")); -+ flags = (op >= OP_SBRA)? match_cbegroup : 0; - for (;;) - { -- /* When we get to the final alternative within the brackets, we would -- return the result of a recursive call to match() whatever happened. We -- can reduce stack usage by turning this into a tail recursion. */ -- - if (ecode[GET(ecode, 1)] != OP_ALT) -- { -- ecode += 1 + LINK_SIZE; -- flags = match_isgroup; -- DPRINTF(("bracket 0 tail recursion\n")); -- goto TAIL_RECURSE; -- } -+ { -+ ecode += _pcre_OP_lengths[*ecode]; -+ flags |= match_tail_recursed; -+ DPRINTF(("bracket 0 tail recursion\n")); -+ goto TAIL_RECURSE; -+ } - - /* For non-final alternatives, continue the loop for a NOMATCH result; - otherwise return. */ - -- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, -- match_isgroup); -+ RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, -+ eptrb, flags); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode += GET(ecode, 1); - } -@@ -688,54 +684,72 @@ - obeyed, we can use tail recursion to avoid using another stack frame. */ - - case OP_COND: -- if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */ -+ case OP_SCOND: -+ if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ -+ { -+ offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ -+ condition = md->recursive != NULL && -+ (offset == RREF_ANY || offset == md->recursive->group_num); -+ ecode += condition? 3 : GET(ecode, 1); -+ } -+ -+ else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ - { - offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ -- condition = (offset == CREF_RECURSE * 2)? -- (md->recursive != NULL) : -- (offset < offset_top && md->offset_vector[offset] >= 0); -- ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1)); -- flags = match_isgroup; -- goto TAIL_RECURSE; -+ condition = offset < offset_top && md->offset_vector[offset] >= 0; -+ ecode += condition? 3 : GET(ecode, 1); -+ } -+ -+ else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ -+ { -+ condition = FALSE; -+ ecode += GET(ecode, 1); - } - - /* The condition is an assertion. Call match() to evaluate it - setting -- the final argument TRUE causes it to stop at the end of an assertion. */ -+ the final argument match_condassert causes it to stop at the end of an -+ assertion. */ - - else - { - RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, -- match_condassert | match_isgroup); -+ match_condassert); - if (rrc == MATCH_MATCH) - { -- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2); -+ condition = TRUE; -+ ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); - while (*ecode == OP_ALT) ecode += GET(ecode, 1); - } - else if (rrc != MATCH_NOMATCH) - { - RRETURN(rrc); /* Need braces because of following else */ - } -- else ecode += GET(ecode, 1); -+ else -+ { -+ condition = FALSE; -+ ecode += GET(ecode, 1); -+ } -+ } - -- /* We are now at the branch that is to be obeyed. As there is only one, -- we can use tail recursion to avoid using another stack frame. */ -+ /* We are now at the branch that is to be obeyed. As there is only one, -+ we can use tail recursion to avoid using another stack frame. If the second -+ alternative doesn't exist, we can just plough on. */ - -+ if (condition || *ecode == OP_ALT) -+ { - ecode += 1 + LINK_SIZE; -- flags = match_isgroup; -+ flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0); - goto TAIL_RECURSE; - } -- /* Control never reaches here */ -- -- /* Skip over conditional reference or large extraction number data if -- encountered. */ -- -- case OP_CREF: -- case OP_BRANUMBER: -- ecode += 3; -+ else -+ { -+ ecode += 1 + LINK_SIZE; -+ } - break; - -- /* End of the pattern. If we are in a recursion, we should restore the -- offsets appropriately and continue from after the call. */ -+ -+ /* End of the pattern. If we are in a top-level recursion, we should -+ restore the offsets appropriately and continue from after the call. */ - - case OP_END: - if (md->recursive != NULL && md->recursive->group_num == 0) -@@ -777,8 +791,7 @@ - case OP_ASSERTBACK: - do - { -- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, -- match_isgroup); -+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0); - if (rrc == MATCH_MATCH) break; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode += GET(ecode, 1); -@@ -804,8 +817,7 @@ - case OP_ASSERTBACK_NOT: - do - { -- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, -- match_isgroup); -+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0); - if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode += GET(ecode,1); -@@ -826,8 +838,8 @@ - #ifdef SUPPORT_UTF8 - if (utf8) - { -- c = GET(ecode,1); -- for (i = 0; i < c; i++) -+ i = GET(ecode, 1); -+ while (i-- > 0) - { - eptr--; - if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); -@@ -840,7 +852,7 @@ - /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ - - { -- eptr -= GET(ecode,1); -+ eptr -= GET(ecode, 1); - if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); - } - -@@ -897,13 +909,8 @@ - case OP_RECURSE: - { - callpat = md->start_code + GET(ecode, 1); -- new_recursive.group_num = *callpat - OP_BRA; -- -- /* For extended extraction brackets (large number), we have to fish out -- the number from a dummy opcode at the start. */ -- -- if (new_recursive.group_num > EXTRACT_BASIC_MAX) -- new_recursive.group_num = GET2(callpat, 2+LINK_SIZE); -+ new_recursive.group_num = (callpat == md->start_code)? 0 : -+ GET2(callpat, 1 + LINK_SIZE); - - /* Add to "recursing stack" */ - -@@ -936,10 +943,11 @@ - restore the offset and recursion data. */ - - DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); -+ flags = (*callpat >= OP_SBRA)? match_cbegroup : 0; - do - { -- RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, -- eptrb, match_isgroup); -+ RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, -+ md, ims, eptrb, flags); - if (rrc == MATCH_MATCH) - { - DPRINTF(("Recursion matched\n")); -@@ -983,7 +991,7 @@ - do - { - RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, -- eptrb, match_isgroup); -+ eptrb, 0); - if (rrc == MATCH_MATCH) break; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode += GET(ecode,1); -@@ -997,7 +1005,7 @@ - /* Continue as from after the assertion, updating the offsets high water - mark, since extracts may have been taken. */ - -- do ecode += GET(ecode,1); while (*ecode == OP_ALT); -+ do ecode += GET(ecode, 1); while (*ecode == OP_ALT); - - offset_top = md->end_offset_top; - eptr = md->end_match_ptr; -@@ -1031,15 +1039,15 @@ - RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode = prev; -- flags = match_isgroup; -+ flags = match_tail_recursed; - goto TAIL_RECURSE; - } - else /* OP_KETRMAX */ - { -- RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); -+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode += 1 + LINK_SIZE; -- flags = 0; -+ flags = match_tail_recursed; - goto TAIL_RECURSE; - } - /* Control never gets here */ -@@ -1060,38 +1068,44 @@ - case OP_BRAZERO: - { - next = ecode+1; -- RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup); -+ RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - do next += GET(next,1); while (*next == OP_ALT); -- ecode = next + 1+LINK_SIZE; -+ ecode = next + 1 + LINK_SIZE; - } - break; - - case OP_BRAMINZERO: - { - next = ecode+1; -- do next += GET(next,1); while (*next == OP_ALT); -- RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, -- match_isgroup); -+ do next += GET(next, 1); while (*next == OP_ALT); -+ RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode++; - } - break; - -- /* End of a group, repeated or non-repeating. If we are at the end of -- an assertion "group", stop matching and return MATCH_MATCH, but record the -- current high water mark for use by positive assertions. Do this also -- for the "once" (not-backup up) groups. */ -+ /* End of a group, repeated or non-repeating. */ - - case OP_KET: - case OP_KETRMIN: - case OP_KETRMAX: - prev = ecode - GET(ecode, 1); -- saved_eptr = eptrb->epb_saved_eptr; - -- /* Back up the stack of bracket start pointers. */ -+ /* If this was a group that remembered the subject start, in order to break -+ infinite repeats of empty string matches, retrieve the subject start from -+ the chain. Otherwise, set it NULL. */ -+ -+ if (*prev >= OP_SBRA) -+ { -+ saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ -+ eptrb = eptrb->epb_prev; /* Backup to previous group */ -+ } -+ else saved_eptr = NULL; - -- eptrb = eptrb->epb_prev; -+ /* If we are at the end of an assertion group, stop matching and return -+ MATCH_MATCH, but record the current high water mark for use by positive -+ assertions. Do this also for the "once" (atomic) groups. */ - - if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || - *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || -@@ -1102,18 +1116,15 @@ - RRETURN(MATCH_MATCH); - } - -- /* In all other cases except a conditional group we have to check the -- group number back at the start and if necessary complete handling an -- extraction by setting the offsets and bumping the high water mark. */ -+ /* For capturing groups we have to check the group number back at the start -+ and if necessary complete handling an extraction by setting the offsets and -+ bumping the high water mark. Note that whole-pattern recursion is coded as -+ a recurse into group 0, so it won't be picked up here. Instead, we catch it -+ when the OP_END is reached. Other recursion is handled here. */ - -- if (*prev != OP_COND) -+ if (*prev == OP_CBRA || *prev == OP_SCBRA) - { -- number = *prev - OP_BRA; -- -- /* For extended extraction brackets (large number), we have to fish out -- the number from a dummy opcode at the start. */ -- -- if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE); -+ number = GET2(prev, 1+LINK_SIZE); - offset = number << 1; - - #ifdef DEBUG -@@ -1121,42 +1132,34 @@ - printf("\n"); - #endif - -- /* Test for a numbered group. This includes groups called as a result -- of recursion. Note that whole-pattern recursion is coded as a recurse -- into group 0, so it won't be picked up here. Instead, we catch it when -- the OP_END is reached. */ -- -- if (number > 0) -+ md->capture_last = number; -+ if (offset >= md->offset_max) md->offset_overflow = TRUE; else - { -- md->capture_last = number; -- if (offset >= md->offset_max) md->offset_overflow = TRUE; else -- { -- md->offset_vector[offset] = -- md->offset_vector[md->offset_end - number]; -- md->offset_vector[offset+1] = eptr - md->start_subject; -- if (offset_top <= offset) offset_top = offset + 2; -- } -- -- /* Handle a recursively called group. Restore the offsets -- appropriately and continue from after the call. */ -- -- if (md->recursive != NULL && md->recursive->group_num == number) -- { -- recursion_info *rec = md->recursive; -- DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); -- md->recursive = rec->prevrec; -- md->start_match = rec->save_start; -- memcpy(md->offset_vector, rec->offset_save, -- rec->saved_max * sizeof(int)); -- ecode = rec->after_call; -- ims = original_ims; -- break; -- } -+ md->offset_vector[offset] = -+ md->offset_vector[md->offset_end - number]; -+ md->offset_vector[offset+1] = eptr - md->start_subject; -+ if (offset_top <= offset) offset_top = offset + 2; -+ } -+ -+ /* Handle a recursively called group. Restore the offsets -+ appropriately and continue from after the call. */ -+ -+ if (md->recursive != NULL && md->recursive->group_num == number) -+ { -+ recursion_info *rec = md->recursive; -+ DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); -+ md->recursive = rec->prevrec; -+ md->start_match = rec->save_start; -+ memcpy(md->offset_vector, rec->offset_save, -+ rec->saved_max * sizeof(int)); -+ ecode = rec->after_call; -+ ims = original_ims; -+ break; - } - } - -- /* Reset the value of the ims flags, in case they got changed during -- the group. */ -+ /* For both capturing and non-capturing groups, reset the value of the ims -+ flags, in case they got changed during the group. */ - - ims = original_ims; - DPRINTF(("ims reset to %02lx\n", ims)); -@@ -1177,20 +1180,22 @@ - preceding bracket, in the appropriate order. In the second case, we can use - tail recursion to avoid using another stack frame. */ - -+ flags = (*prev >= OP_SBRA)? match_cbegroup : 0; -+ - if (*ecode == OP_KETRMIN) - { - RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode = prev; -- flags = match_isgroup; -+ flags |= match_tail_recursed; - goto TAIL_RECURSE; - } - else /* OP_KETRMAX */ - { -- RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); -+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode += 1 + LINK_SIZE; -- flags = 0; -+ flags = match_tail_recursed; - goto TAIL_RECURSE; - } - /* Control never gets here */ -@@ -1202,9 +1207,7 @@ - if ((ims & PCRE_MULTILINE) != 0) - { - if (eptr != md->start_subject && -- (eptr == md->end_subject || -- eptr < md->start_subject + md->nllen || -- !IS_NEWLINE(eptr - md->nllen))) -+ (eptr == md->end_subject || !WAS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); - ecode++; - break; -@@ -1244,7 +1247,7 @@ - if (!md->endonly) - { - if (eptr != md->end_subject && -- (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) -+ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) - RRETURN(MATCH_NOMATCH); - ecode++; - break; -@@ -1263,7 +1266,7 @@ - - case OP_EODN: - if (eptr != md->end_subject && -- (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) -+ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) - RRETURN(MATCH_NOMATCH); - ecode++; - break; -@@ -1319,8 +1322,7 @@ - case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) - { -- if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) -- RRETURN(MATCH_NOMATCH); -+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - } - if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); - if (utf8) -@@ -1414,6 +1416,26 @@ - ecode++; - break; - -+ case OP_ANYNL: -+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); -+ GETCHARINCTEST(c, eptr); -+ switch(c) -+ { -+ default: RRETURN(MATCH_NOMATCH); -+ case 0x000d: -+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; -+ break; -+ case 0x000a: -+ case 0x000b: -+ case 0x000c: -+ case 0x0085: -+ case 0x2028: -+ case 0x2029: -+ break; -+ } -+ ecode++; -+ break; -+ - #ifdef SUPPORT_UCP - /* Check the next character by Unicode property. We will get here only - if the support is in the binary; otherwise a compile-time error occurs. */ -@@ -1456,7 +1478,6 @@ - - default: - RRETURN(PCRE_ERROR_INTERNAL); -- break; - } - - ecode += 3; -@@ -1926,7 +1947,7 @@ - - else - { -- int dc; -+ unsigned int dc; - GETCHARINC(dc, eptr); - ecode += length; - -@@ -1953,13 +1974,17 @@ - } - break; - -- /* Match a single character repeatedly; different opcodes share code. */ -+ /* Match a single character repeatedly. */ - - case OP_EXACT: - min = max = GET2(ecode, 1); - ecode += 3; - goto REPEATCHAR; - -+ case OP_POSUPTO: -+ possessive = TRUE; -+ /* Fall through */ -+ - case OP_UPTO: - case OP_MINUPTO: - min = 0; -@@ -1968,6 +1993,27 @@ - ecode += 3; - goto REPEATCHAR; - -+ case OP_POSSTAR: -+ possessive = TRUE; -+ min = 0; -+ max = INT_MAX; -+ ecode++; -+ goto REPEATCHAR; -+ -+ case OP_POSPLUS: -+ possessive = TRUE; -+ min = 1; -+ max = INT_MAX; -+ ecode++; -+ goto REPEATCHAR; -+ -+ case OP_POSQUERY: -+ possessive = TRUE; -+ min = 0; -+ max = 1; -+ ecode++; -+ goto REPEATCHAR; -+ - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: -@@ -2003,10 +2049,9 @@ - uschar occhars[8]; - - #ifdef SUPPORT_UCP -- int othercase; -+ unsigned int othercase; - if ((ims & PCRE_CASELESS) != 0 && -- (othercase = _pcre_ucp_othercase(fc)) >= 0 && -- othercase >= 0) -+ (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) - oclength = _pcre_ord2utf8(othercase, occhars); - #endif /* SUPPORT_UCP */ - -@@ -2042,7 +2087,8 @@ - } - /* Control never gets here */ - } -- else -+ -+ else /* Maximize */ - { - pp = eptr; - for (i = min; i < max; i++) -@@ -2056,6 +2102,8 @@ - eptr += oclength; - } - } -+ -+ if (possessive) continue; - while (eptr >= pp) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -2110,7 +2158,7 @@ - } - /* Control never gets here */ - } -- else -+ else /* Maximize */ - { - pp = eptr; - for (i = min; i < max; i++) -@@ -2118,6 +2166,7 @@ - if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; - eptr++; - } -+ if (possessive) continue; - while (eptr >= pp) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -2146,7 +2195,7 @@ - } - /* Control never gets here */ - } -- else -+ else /* Maximize */ - { - pp = eptr; - for (i = min; i < max; i++) -@@ -2154,6 +2203,7 @@ - if (eptr >= md->end_subject || fc != *eptr) break; - eptr++; - } -+ if (possessive) continue; - while (eptr >= pp) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -2206,6 +2256,34 @@ - ecode += 3; - goto REPEATNOTCHAR; - -+ case OP_NOTPOSSTAR: -+ possessive = TRUE; -+ min = 0; -+ max = INT_MAX; -+ ecode++; -+ goto REPEATNOTCHAR; -+ -+ case OP_NOTPOSPLUS: -+ possessive = TRUE; -+ min = 1; -+ max = INT_MAX; -+ ecode++; -+ goto REPEATNOTCHAR; -+ -+ case OP_NOTPOSQUERY: -+ possessive = TRUE; -+ min = 0; -+ max = 1; -+ ecode++; -+ goto REPEATNOTCHAR; -+ -+ case OP_NOTPOSUPTO: -+ possessive = TRUE; -+ min = 0; -+ max = GET2(ecode, 1); -+ ecode += 3; -+ goto REPEATNOTCHAR; -+ - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: -@@ -2245,7 +2323,7 @@ - /* UTF-8 mode */ - if (utf8) - { -- register int d; -+ register unsigned int d; - for (i = 1; i <= min; i++) - { - GETCHARINC(d, eptr); -@@ -2270,7 +2348,7 @@ - /* UTF-8 mode */ - if (utf8) - { -- register int d; -+ register unsigned int d; - for (fi = min;; fi++) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -2306,7 +2384,7 @@ - /* UTF-8 mode */ - if (utf8) - { -- register int d; -+ register unsigned int d; - for (i = min; i < max; i++) - { - int len = 1; -@@ -2316,7 +2394,8 @@ - if (fc == d) break; - eptr += len; - } -- for(;;) -+ if (possessive) continue; -+ for(;;) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); -@@ -2333,6 +2412,7 @@ - if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; - eptr++; - } -+ if (possessive) continue; - while (eptr >= pp) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -2354,7 +2434,7 @@ - /* UTF-8 mode */ - if (utf8) - { -- register int d; -+ register unsigned int d; - for (i = 1; i <= min; i++) - { - GETCHARINC(d, eptr); -@@ -2377,7 +2457,7 @@ - /* UTF-8 mode */ - if (utf8) - { -- register int d; -+ register unsigned int d; - for (fi = min;; fi++) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -2412,7 +2492,7 @@ - /* UTF-8 mode */ - if (utf8) - { -- register int d; -+ register unsigned int d; - for (i = min; i < max; i++) - { - int len = 1; -@@ -2421,6 +2501,7 @@ - if (fc == d) break; - eptr += len; - } -+ if (possessive) continue; - for(;;) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -2438,6 +2519,7 @@ - if (eptr >= md->end_subject || fc == *eptr) break; - eptr++; - } -+ if (possessive) continue; - while (eptr >= pp) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -2469,6 +2551,34 @@ - ecode += 3; - goto REPEATTYPE; - -+ case OP_TYPEPOSSTAR: -+ possessive = TRUE; -+ min = 0; -+ max = INT_MAX; -+ ecode++; -+ goto REPEATTYPE; -+ -+ case OP_TYPEPOSPLUS: -+ possessive = TRUE; -+ min = 1; -+ max = INT_MAX; -+ ecode++; -+ goto REPEATTYPE; -+ -+ case OP_TYPEPOSQUERY: -+ possessive = TRUE; -+ min = 0; -+ max = 1; -+ ecode++; -+ goto REPEATTYPE; -+ -+ case OP_TYPEPOSUPTO: -+ possessive = TRUE; -+ min = 0; -+ max = GET2(ecode, 1); -+ ecode += 3; -+ goto REPEATTYPE; -+ - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: -@@ -2571,7 +2681,6 @@ - - default: - RRETURN(PCRE_ERROR_INTERNAL); -- break; - } - } - -@@ -2611,9 +2720,7 @@ - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject || -- ((ims & PCRE_DOTALL) == 0 && -- eptr <= md->end_subject - md->nllen && -- IS_NEWLINE(eptr))) -+ ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; -@@ -2624,6 +2731,28 @@ - eptr += min; - break; - -+ case OP_ANYNL: -+ for (i = 1; i <= min; i++) -+ { -+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); -+ GETCHARINC(c, eptr); -+ switch(c) -+ { -+ default: RRETURN(MATCH_NOMATCH); -+ case 0x000d: -+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; -+ break; -+ case 0x000a: -+ case 0x000b: -+ case 0x000c: -+ case 0x0085: -+ case 0x2028: -+ case 0x2029: -+ break; -+ } -+ } -+ break; -+ - case OP_NOT_DIGIT: - for (i = 1; i <= min; i++) - { -@@ -2692,7 +2821,8 @@ - #endif /* SUPPORT_UTF8 */ - - /* Code for the non-UTF-8 case for minimum matching of operators other -- than OP_PROP and OP_NOTPROP. */ -+ than OP_PROP and OP_NOTPROP. We can assume that there are the minimum -+ number of bytes present, as this was tested above. */ - - switch(ctype) - { -@@ -2701,8 +2831,7 @@ - { - for (i = 1; i <= min; i++) - { -- if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) -- RRETURN(MATCH_NOMATCH); -+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - eptr++; - } - } -@@ -2713,6 +2842,28 @@ - eptr += min; - break; - -+ /* Because of the CRLF case, we can't assume the minimum number of -+ bytes are present in this case. */ -+ -+ case OP_ANYNL: -+ for (i = 1; i <= min; i++) -+ { -+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); -+ switch(*eptr++) -+ { -+ default: RRETURN(MATCH_NOMATCH); -+ case 0x000d: -+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; -+ break; -+ case 0x000a: -+ case 0x000b: -+ case 0x000c: -+ case 0x0085: -+ break; -+ } -+ } -+ break; -+ - case OP_NOT_DIGIT: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); -@@ -2774,7 +2925,7 @@ - GETCHARINC(c, eptr); - if (prop_fail_result) RRETURN(MATCH_NOMATCH); - } -- break; -+ /* Control never gets here */ - - case PT_LAMP: - for (fi = min;; fi++) -@@ -2789,7 +2940,7 @@ - prop_chartype == ucp_Lt) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } -- break; -+ /* Control never gets here */ - - case PT_GC: - for (fi = min;; fi++) -@@ -2802,7 +2953,7 @@ - if ((prop_category == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } -- break; -+ /* Control never gets here */ - - case PT_PC: - for (fi = min;; fi++) -@@ -2815,7 +2966,7 @@ - if ((prop_chartype == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } -- break; -+ /* Control never gets here */ - - case PT_SC: - for (fi = min;; fi++) -@@ -2828,11 +2979,10 @@ - if ((prop_script == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } -- break; -+ /* Control never gets here */ - - default: - RRETURN(PCRE_ERROR_INTERNAL); -- break; - } - } - -@@ -2876,7 +3026,7 @@ - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || - (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && -- eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) -+ IS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); - - GETCHARINC(c, eptr); -@@ -2888,6 +3038,23 @@ - case OP_ANYBYTE: - break; - -+ case OP_ANYNL: -+ switch(c) -+ { -+ default: RRETURN(MATCH_NOMATCH); -+ case 0x000d: -+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; -+ break; -+ case 0x000a: -+ case 0x000b: -+ case 0x000c: -+ case 0x0085: -+ case 0x2028: -+ case 0x2029: -+ break; -+ } -+ break; -+ - case OP_NOT_DIGIT: - if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); -@@ -2932,8 +3099,7 @@ - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || -- ((ims & PCRE_DOTALL) == 0 && -- eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) -+ ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); - - c = *eptr++; -@@ -2945,6 +3111,21 @@ - case OP_ANYBYTE: - break; - -+ case OP_ANYNL: -+ switch(c) -+ { -+ default: RRETURN(MATCH_NOMATCH); -+ case 0x000d: -+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; -+ break; -+ case 0x000a: -+ case 0x000b: -+ case 0x000c: -+ case 0x0085: -+ break; -+ } -+ break; -+ - case OP_NOT_DIGIT: - if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); - break; -@@ -2977,7 +3158,7 @@ - /* Control never gets here */ - } - -- /* If maximizing it is worth using inline code for speed, doing the type -+ /* If maximizing, it is worth using inline code for speed, doing the type - test once at the start (i.e. keep it out of the loop). Again, keep the - UTF-8 and UCP stuff separate. */ - -@@ -3058,6 +3239,7 @@ - - /* eptr is now past the end of the maximum run */ - -+ if (possessive) continue; - for(;;) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -3093,6 +3275,7 @@ - - /* eptr is now past the end of the maximum run */ - -+ if (possessive) continue; - for(;;) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -3135,9 +3318,7 @@ - { - for (i = min; i < max; i++) - { -- if (eptr >= md->end_subject || -- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) -- break; -+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } -@@ -3161,9 +3342,7 @@ - { - for (i = min; i < max; i++) - { -- if (eptr >= md->end_subject || -- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) -- break; -+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - } - break; -@@ -3171,7 +3350,8 @@ - else - { - c = max - min; -- if (c > md->end_subject - eptr) c = md->end_subject - eptr; -+ if (c > (unsigned int)(md->end_subject - eptr)) -+ c = md->end_subject - eptr; - eptr += c; - } - } -@@ -3181,10 +3361,32 @@ - - case OP_ANYBYTE: - c = max - min; -- if (c > md->end_subject - eptr) c = md->end_subject - eptr; -+ if (c > (unsigned int)(md->end_subject - eptr)) -+ c = md->end_subject - eptr; - eptr += c; - break; - -+ case OP_ANYNL: -+ for (i = min; i < max; i++) -+ { -+ int len = 1; -+ if (eptr >= md->end_subject) break; -+ GETCHARLEN(c, eptr, len); -+ if (c == 0x000d) -+ { -+ if (++eptr >= md->end_subject) break; -+ if (*eptr == 0x000a) eptr++; -+ } -+ else -+ { -+ if (c != 0x000a && c != 0x000b && c != 0x000c && -+ c != 0x0085 && c != 0x2028 && c != 0x2029) -+ break; -+ eptr += len; -+ } -+ } -+ break; -+ - case OP_NOT_DIGIT: - for (i = min; i < max; i++) - { -@@ -3257,6 +3459,7 @@ - - /* eptr is now past the end of the maximum run */ - -+ if (possessive) continue; - for(;;) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -3277,9 +3480,7 @@ - { - for (i = min; i < max; i++) - { -- if (eptr >= md->end_subject || -- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) -- break; -+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - } - break; -@@ -3288,10 +3489,30 @@ - - case OP_ANYBYTE: - c = max - min; -- if (c > md->end_subject - eptr) c = md->end_subject - eptr; -+ if (c > (unsigned int)(md->end_subject - eptr)) -+ c = md->end_subject - eptr; - eptr += c; - break; - -+ case OP_ANYNL: -+ for (i = min; i < max; i++) -+ { -+ if (eptr >= md->end_subject) break; -+ c = *eptr; -+ if (c == 0x000d) -+ { -+ if (++eptr >= md->end_subject) break; -+ if (*eptr == 0x000a) eptr++; -+ } -+ else -+ { -+ if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085) -+ break; -+ eptr++; -+ } -+ } -+ break; -+ - case OP_NOT_DIGIT: - for (i = min; i < max; i++) - { -@@ -3352,6 +3573,7 @@ - - /* eptr is now past the end of the maximum run */ - -+ if (possessive) continue; - while (eptr >= pp) - { - RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); -@@ -3366,14 +3588,12 @@ - } - /* Control never gets here */ - -- /* There's been some horrible disaster. Since all codes > OP_BRA are -- for capturing brackets, and there shouldn't be any gaps between 0 and -- OP_BRA, arrival here can only mean there is something seriously wrong -- in the code above or the OP_xxx definitions. */ -+ /* There's been some horrible disaster. Arrival here can only mean there is -+ something seriously wrong in the code above or the OP_xxx definitions. */ - - default: - DPRINTF(("Unknown opcode %d\n", *ecode)); -- RRETURN(PCRE_ERROR_UNKNOWN_NODE); -+ RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); - } - - /* Do not stick any code in here without much thought; it is assumed -@@ -3411,7 +3631,6 @@ - - #undef cur_is_word - #undef condition --#undef minimize - #undef prev_is_word - - #undef original_ims -@@ -3484,6 +3703,7 @@ - BOOL firstline; - BOOL first_byte_caseless = FALSE; - BOOL req_byte_caseless = FALSE; -+BOOL utf8; - match_data match_block; - match_data *md = &match_block; - const uschar *tables; -@@ -3491,6 +3711,7 @@ - USPTR start_match = (USPTR)subject + start_offset; - USPTR end_subject; - USPTR req_byte_ptr = start_match - 1; -+eptrblock eptrchain[EPTR_WORK_SIZE]; - - pcre_study_data internal_study; - const pcre_study_data *study; -@@ -3567,7 +3788,7 @@ - end_subject = md->end_subject; - - md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; --md->utf8 = (re->options & PCRE_UTF8) != 0; -+utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; - - md->notbol = (options & PCRE_NOTBOL) != 0; - md->noteol = (options & PCRE_NOTEOL) != 0; -@@ -3576,6 +3797,7 @@ - md->hitend = FALSE; - - md->recursive = NULL; /* No recursion at top level */ -+md->eptrchain = eptrchain; /* Make workspace generally available */ - - md->lcc = tables + lcc_offset; - md->ctypes = tables + ctypes_offset; -@@ -3583,26 +3805,36 @@ - /* Handle different types of newline. The two bits give four cases. If nothing - is set at run time, whatever was used at compile time applies. */ - --switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & -- PCRE_NEWLINE_CRLF) -+switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) & -+ PCRE_NEWLINE_BITS) - { -- default: newline = NEWLINE; break; /* Compile-time default */ -+ case 0: newline = NEWLINE; break; /* Compile-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; - case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; -+ case PCRE_NEWLINE_ANY: newline = -1; break; -+ default: return PCRE_ERROR_BADNEWLINE; - } - --if (newline > 255) -+if (newline < 0) - { -- md->nllen = 2; -- md->nl[0] = (newline >> 8) & 255; -- md->nl[1] = newline & 255; -+ md->nltype = NLTYPE_ANY; - } - else - { -- md->nllen = 1; -- md->nl[0] = newline; -+ md->nltype = NLTYPE_FIXED; -+ if (newline > 255) -+ { -+ md->nllen = 2; -+ md->nl[0] = (newline >> 8) & 255; -+ md->nl[1] = newline & 255; -+ } -+ else -+ { -+ md->nllen = 1; -+ md->nl[0] = newline; -+ } - } - - /* Partial matching is supported only for a restricted set of regexes at the -@@ -3615,7 +3847,7 @@ - back the character offset. */ - - #ifdef SUPPORT_UTF8 --if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) -+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) - { - if (_pcre_valid_utf8((uschar *)subject, length) >= 0) - return PCRE_ERROR_BADUTF8; -@@ -3707,10 +3939,13 @@ - req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ - } - -+ -+/* ==========================================================================*/ -+ - /* Loop for handling unanchored repeated matching attempts; for anchored regexs - the loop runs just once. */ - --do -+for(;;) - { - USPTR save_end_subject = end_subject; - -@@ -3725,14 +3960,14 @@ - - /* Advance to a unique first char if possible. If firstline is TRUE, the - start of the match is constrained to the first line of a multiline string. -- Implement this by temporarily adjusting end_subject so that we stop scanning -- at a newline. If the match fails at the newline, later code breaks this loop. -- */ -+ That is, the match must be before or at the first newline. Implement this by -+ temporarily adjusting end_subject so that we stop scanning at a newline. If -+ the match fails at the newline, later code breaks this loop. */ - - if (firstline) - { - USPTR t = start_match; -- while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; -+ while (t < md->end_subject && !IS_NEWLINE(t)) t++; - end_subject = t; - } - -@@ -3753,11 +3988,9 @@ - - else if (startline) - { -- if (start_match >= md->start_subject + md->nllen + -- start_offset) -+ if (start_match > md->start_subject + start_offset) - { -- while (start_match <= end_subject && -- !IS_NEWLINE(start_match - md->nllen)) -+ while (start_match <= end_subject && !WAS_NEWLINE(start_match)) - start_match++; - } - } -@@ -3793,8 +4026,8 @@ - - HOWEVER: when the subject string is very, very long, searching to its end can - take a long time, and give bad performance on quite ordinary patterns. This -- showed up when somebody was matching /^C/ on a 32-megabyte string... so we -- don't do this when the string is sufficiently long. -+ showed up when somebody was matching something like /^\d+C/ on a 32-megabyte -+ string... so we don't do this when the string is sufficiently long. - - ALSO: this processing is disabled when partial matching is requested. - */ -@@ -3826,9 +4059,14 @@ - } - } - -- /* If we can't find the required character, break the matching loop */ -+ /* If we can't find the required character, break the matching loop, -+ forcing a match failure. */ - -- if (p >= end_subject) break; -+ if (p >= end_subject) -+ { -+ rc = MATCH_NOMATCH; -+ break; -+ } - - /* If we have found the required character, save the point where we - found it, so that we don't search again next time round the loop if -@@ -3838,49 +4076,70 @@ - } - } - -- /* When a match occurs, substrings will be set for all internal extractions; -- we just need to set up the whole thing as substring 0 before returning. If -- there were too many extractions, set the return code to zero. In the case -- where we had to get some local store to hold offsets for backreferences, copy -- those back references that we can. In this case there need not be overflow -- if certain parts of the pattern were not used. */ -+ /* OK, we can now run the match. */ - - md->start_match = start_match; - md->match_call_count = 0; -+ md->eptrn = 0; /* Next free eptrchain slot */ -+ rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0); - -- rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0); -+ /* Any return other than MATCH_NOMATCH breaks the loop. */ - -- /* When the result is no match, if the subject's first character was a -- newline and the PCRE_FIRSTLINE option is set, break (which will return -- PCRE_ERROR_NOMATCH). The option requests that a match occur before the first -- newline in the subject. Otherwise, advance the pointer to the next character -- and continue - but the continuation will actually happen only when the -- pattern is not anchored. */ -+ if (rc != MATCH_NOMATCH) break; - -- if (rc == MATCH_NOMATCH) -- { -- if (firstline && -- start_match <= md->end_subject - md->nllen && -- IS_NEWLINE(start_match)) -- break; -- start_match++; -+ /* If PCRE_FIRSTLINE is set, the match must happen before or at the first -+ newline in the subject (though it may continue over the newline). Therefore, -+ if we have just failed to match, starting at a newline, do not continue. */ -+ -+ if (firstline && IS_NEWLINE(start_match)) break; -+ -+ /* Advance the match position by one character. */ -+ -+ start_match++; - #ifdef SUPPORT_UTF8 -- if (md->utf8) -- while(start_match < end_subject && (*start_match & 0xc0) == 0x80) -- start_match++; -+ if (utf8) -+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80) -+ start_match++; - #endif -- continue; -- } - -- if (rc != MATCH_MATCH) -- { -- DPRINTF((">>>> error: returning %d\n", rc)); -- return rc; -- } -+ /* Break the loop if the pattern is anchored or if we have passed the end of -+ the subject. */ -+ -+ if (anchored || start_match > end_subject) break; -+ -+ /* If we have just passed a CR and the newline option is CRLF or ANY, and we -+ are now at a LF, advance the match position by one more character. */ -+ -+ if (start_match[-1] == '\r' && -+ (md->nltype == NLTYPE_ANY || md->nllen == 2) && -+ start_match < end_subject && -+ *start_match == '\n') -+ start_match++; -+ -+ } /* End of for(;;) "bumpalong" loop */ -+ -+/* ==========================================================================*/ -+ -+/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping -+conditions is true: - -- /* We have a match! Copy the offset information from temporary store if -- necessary */ -+(1) The pattern is anchored; - -+(2) We are past the end of the subject; -+ -+(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because -+ this option requests that a match occur at or before the first newline in -+ the subject. -+ -+When we have a match and the offset vector is big enough to deal with any -+backreferences, captured substring offsets will already be set up. In the case -+where we had to get some local store to hold offsets for backreference -+processing, copy those that we can. In this case there need not be overflow if -+certain parts of the pattern were not used, even though there are more -+capturing parentheses than vector slots. */ -+ -+if (rc == MATCH_MATCH) -+ { - if (using_temporary_offsets) - { - if (offsetcount >= 4) -@@ -3889,15 +4148,18 @@ - (offsetcount - 2) * sizeof(int)); - DPRINTF(("Copied offsets from temporary memory\n")); - } -- if (md->end_offset_top > offsetcount) -- md->offset_overflow = TRUE; -- -+ if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; - DPRINTF(("Freeing temporary memory\n")); - (pcre_free)(md->offset_vector); - } - -+ /* Set the return code to the number of captured strings, or 0 if there are -+ too many to fit into the vector. */ -+ - rc = md->offset_overflow? 0 : md->end_offset_top/2; - -+ /* If there is space, set up the whole thing as substring 0. */ -+ - if (offsetcount < 2) rc = 0; else - { - offsets[0] = start_match - md->start_subject; -@@ -3908,9 +4170,8 @@ - return rc; - } - --/* This "while" is the end of the "do" above */ -- --while (!anchored && start_match <= end_subject); -+/* Control gets here if there has been an error, or if the overall match -+attempt has failed at all permitted starting positions. */ - - if (using_temporary_offsets) - { -@@ -3918,7 +4179,12 @@ - (pcre_free)(md->offset_vector); - } - --if (md->partial && md->hitend) -+if (rc != MATCH_NOMATCH) -+ { -+ DPRINTF((">>>> error: returning %d\n", rc)); -+ return rc; -+ } -+else if (md->partial && md->hitend) - { - DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); - return PCRE_ERROR_PARTIAL; -diff -ruN ../pcre.orig/pcrelib/pcre_globals.c ./pcrelib/pcre_globals.c ---- ../pcre.orig/pcrelib/pcre_globals.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_globals.c Fri Feb 9 22:31:19 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -51,6 +51,18 @@ - - - #ifndef VPCOMPAT -+ -+/************************************************************************** -+This code used to be here for use when compiling as a C++ library. However, -+according to Dair Grant it is not needed: " -+ -+ Including 'extern "C"' in the declaration generates an "initialized and -+ declared `extern'" warning from gcc 4.0.1. Since we include pcre_internal.h, -+ which includes pcre.h, which declares these prototypes within an extern "C" {} -+ block, we shouldn't need the prefix here. -+ -+So, from Release 7.0 I have cut this out. -+ - #ifdef __cplusplus - extern "C" void *(*pcre_malloc)(size_t) = malloc; - extern "C" void (*pcre_free)(void *) = free; -@@ -58,12 +70,13 @@ - extern "C" void (*pcre_stack_free)(void *) = free; - extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL; - #else -+**************************************************************************/ -+ - void *(*pcre_malloc)(size_t) = malloc; - void (*pcre_free)(void *) = free; - void *(*pcre_stack_malloc)(size_t) = malloc; - void (*pcre_stack_free)(void *) = free; - int (*pcre_callout)(pcre_callout_block *) = NULL; --#endif - #endif - - /* End of pcre_globals.c */ -diff -ruN ../pcre.orig/pcrelib/pcre_internal.h ./pcrelib/pcre_internal.h ---- ../pcre.orig/pcrelib/pcre_internal.h Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_internal.h Fri Feb 9 22:31:20 2007 -@@ -7,7 +7,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -54,12 +54,16 @@ - /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef - inline, and there are *still* stupid compilers about that don't like indented - pre-processor statements, or at least there were when I first wrote this. After --all, it had only been about 10 years then... */ -+all, it had only been about 10 years then... - -+It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so -+be absolutely sure we get our version. */ -+ -+#undef DPRINTF - #ifdef DEBUG - #define DPRINTF(p) printf p - #else --#define DPRINTF(p) /*nothing*/ -+#define DPRINTF(p) /* Nothing */ - #endif - - -@@ -118,13 +122,48 @@ - - typedef unsigned char uschar; - --/* PCRE is able to support 3 different kinds of newline (CR, LF, CRLF). The --following macro is used to package up testing for newlines. NLBLOCK is defined --in the various modules to indicate in which datablock the parameters exist. */ -+/* This is an unsigned int value that no character can ever have. UTF-8 -+characters only go up to 0x7fffffff (though Unicode doesn't go beyond -+0x0010ffff). */ -+ -+#define NOTACHAR 0xffffffff -+ -+/* PCRE is able to support several different kinds of newline (CR, LF, CRLF, -+and "all" at present). The following macros are used to package up testing for -+newlines. NLBLOCK, PSSTART, and PSEND are defined in the various modules to -+indicate in which datablock the parameters exist, and what the start/end of -+string field names are. */ -+ -+#define NLTYPE_FIXED 0 /* Newline is a fixed length string */ -+#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ -+ -+/* This macro checks for a newline at the given position */ - - #define IS_NEWLINE(p) \ -- ((p)[0] == NLBLOCK->nl[0] && \ -- (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1])) -+ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ -+ ((p) < NLBLOCK->PSEND && \ -+ _pcre_is_newline((p), NLBLOCK->PSEND, &(NLBLOCK->nllen), utf8) \ -+ ) \ -+ : \ -+ ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ -+ (p)[0] == NLBLOCK->nl[0] && \ -+ (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \ -+ ) \ -+ ) -+ -+/* This macro checks for a newline immediately preceding the given position */ -+ -+#define WAS_NEWLINE(p) \ -+ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ -+ ((p) > NLBLOCK->PSSTART && \ -+ _pcre_was_newline((p), NLBLOCK->PSSTART, &(NLBLOCK->nllen), utf8) \ -+ ) \ -+ : \ -+ ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ -+ (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ -+ (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \ -+ ) \ -+ ) - - /* When PCRE is compiled as a C++ library, the subject pointer can be replaced - with a custom type. This makes it possible, for example, to allow pcre_exec() -@@ -282,7 +321,7 @@ - - #define GETCHAR(c, eptr) \ - c = *eptr; \ -- if ((c & 0xc0) == 0xc0) \ -+ if (c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ -@@ -300,7 +339,7 @@ - - #define GETCHARTEST(c, eptr) \ - c = *eptr; \ -- if (utf8 && (c & 0xc0) == 0xc0) \ -+ if (utf8 && c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ -@@ -318,7 +357,7 @@ - - #define GETCHARINC(c, eptr) \ - c = *eptr++; \ -- if ((c & 0xc0) == 0xc0) \ -+ if (c >= 0xc0) \ - { \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ - int gcss = 6*gcaa; \ -@@ -334,7 +373,7 @@ - - #define GETCHARINCTEST(c, eptr) \ - c = *eptr++; \ -- if (utf8 && (c & 0xc0) == 0xc0) \ -+ if (utf8 && c >= 0xc0) \ - { \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ - int gcss = 6*gcaa; \ -@@ -351,7 +390,7 @@ - - #define GETCHARLEN(c, eptr, len) \ - c = *eptr; \ -- if ((c & 0xc0) == 0xc0) \ -+ if (c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ -@@ -404,20 +443,21 @@ - /* Masks for identifying the public options that are permitted at compile - time, run time, or study time, respectively. */ - -+#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY) -+ - #define PUBLIC_OPTIONS \ - (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ - PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ - PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ -- PCRE_DUPNAMES|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF) -+ PCRE_DUPNAMES|PCRE_NEWLINE_BITS) - - #define PUBLIC_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ -- PCRE_PARTIAL|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF) -+ PCRE_PARTIAL|PCRE_NEWLINE_BITS) - - #define PUBLIC_DFA_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ -- PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_CR| \ -- PCRE_NEWLINE_LF) -+ PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS) - - #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ - -@@ -449,9 +489,7 @@ - #define FALSE 0 - #define TRUE 1 - --/* Escape items that are just an encoding of a particular data value. Note that --ESC_n is defined as yet another macro, which is set in config.h to either \n --(the default) or \r (which some people want). */ -+/* Escape items that are just an encoding of a particular data value. */ - - #ifndef ESC_e - #define ESC_e 27 -@@ -462,7 +500,7 @@ - #endif - - #ifndef ESC_n --#define ESC_n NEWLINE -+#define ESC_n '\n' - #endif - - #ifndef ESC_r -@@ -501,21 +539,28 @@ - their negation. Also, they must appear in the same order as in the opcode - definitions below, up to ESC_z. There's a dummy for OP_ANY because it - corresponds to "." rather than an escape sequence. The final one must be --ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two --tests in the code for an escape greater than ESC_b and less than ESC_Z to --detect the types that may be repeated. These are the types that consume --characters. If any new escapes are put in between that don't consume a -+ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). -+There are two tests in the code for an escape greater than ESC_b and less than -+ESC_Z to detect the types that may be repeated. These are the types that -+consume characters. If any new escapes are put in between that don't consume a - character, that code will have to change. */ - - enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, -- ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E, -- ESC_Q, ESC_REF }; -+ ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_X, ESC_Z, ESC_z, -+ ESC_E, ESC_Q, ESC_k, ESC_REF }; -+ - - /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets - that extract substrings. Starting from 1 (i.e. after OP_END), the values up to - OP_EOD must correspond in order to the list of escapes immediately above. --Note that whenever this list is updated, the two macro definitions that follow --must also be updated to match. */ -+ -+To keep stored, compiled patterns compatible, new opcodes should be added -+immediately before OP_BRA, where (since release 7.0) a gap is left for this -+purpose. -+ -+*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions -+that follow must also be updated to match. There is also a table called -+"coptable" in pcre_dfa_exec.c that must be updated. */ - - enum { - OP_END, /* 0 End of pattern */ -@@ -536,110 +581,122 @@ - OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */ - OP_NOTPROP, /* 13 \P (not Unicode property) */ - OP_PROP, /* 14 \p (Unicode property) */ -- OP_EXTUNI, /* 15 \X (extended Unicode sequence */ -- OP_EODN, /* 16 End of data or \n at end of data: \Z. */ -- OP_EOD, /* 17 End of data: \z */ -- -- OP_OPT, /* 18 Set runtime options */ -- OP_CIRC, /* 19 Start of line - varies with multiline switch */ -- OP_DOLL, /* 20 End of line - varies with multiline switch */ -- OP_CHAR, /* 21 Match one character, casefully */ -- OP_CHARNC, /* 22 Match one character, caselessly */ -- OP_NOT, /* 23 Match one character, not the following one */ -- -- OP_STAR, /* 24 The maximizing and minimizing versions of */ -- OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */ -- OP_PLUS, /* 26 the minimizing one second. */ -- OP_MINPLUS, /* 27 This first set applies to single characters */ -- OP_QUERY, /* 28 */ -- OP_MINQUERY, /* 29 */ -- OP_UPTO, /* 30 From 0 to n matches */ -- OP_MINUPTO, /* 31 */ -- OP_EXACT, /* 32 Exactly n matches */ -- -- OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */ -- OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */ -- OP_NOTPLUS, /* 35 the minimizing one second. */ -- OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */ -- OP_NOTQUERY, /* 37 */ -- OP_NOTMINQUERY, /* 38 */ -- OP_NOTUPTO, /* 39 From 0 to n matches */ -- OP_NOTMINUPTO, /* 40 */ -- OP_NOTEXACT, /* 41 Exactly n matches */ -- -- OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */ -- OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */ -- OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */ -- OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */ -- OP_TYPEQUERY, /* 46 This set applies to character types such as \d */ -- OP_TYPEMINQUERY, /* 47 */ -- OP_TYPEUPTO, /* 48 From 0 to n matches */ -- OP_TYPEMINUPTO, /* 49 */ -- OP_TYPEEXACT, /* 50 Exactly n matches */ -- -- OP_CRSTAR, /* 51 The maximizing and minimizing versions of */ -- OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */ -- OP_CRPLUS, /* 53 the minimizing one second. These codes must */ -- OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */ -- OP_CRQUERY, /* 55 These are for character classes and back refs */ -- OP_CRMINQUERY, /* 56 */ -- OP_CRRANGE, /* 57 These are different to the three sets above. */ -- OP_CRMINRANGE, /* 58 */ -+ OP_ANYNL, /* 15 \R (any newline sequence) */ -+ OP_EXTUNI, /* 16 \X (extended Unicode sequence */ -+ OP_EODN, /* 17 End of data or \n at end of data: \Z. */ -+ OP_EOD, /* 18 End of data: \z */ -+ -+ OP_OPT, /* 19 Set runtime options */ -+ OP_CIRC, /* 20 Start of line - varies with multiline switch */ -+ OP_DOLL, /* 21 End of line - varies with multiline switch */ -+ OP_CHAR, /* 22 Match one character, casefully */ -+ OP_CHARNC, /* 23 Match one character, caselessly */ -+ OP_NOT, /* 24 Match one character, not the following one */ -+ -+ OP_STAR, /* 25 The maximizing and minimizing versions of */ -+ OP_MINSTAR, /* 26 these six opcodes must come in pairs, with */ -+ OP_PLUS, /* 27 the minimizing one second. */ -+ OP_MINPLUS, /* 28 This first set applies to single characters.*/ -+ OP_QUERY, /* 29 */ -+ OP_MINQUERY, /* 30 */ -+ -+ OP_UPTO, /* 31 From 0 to n matches */ -+ OP_MINUPTO, /* 32 */ -+ OP_EXACT, /* 33 Exactly n matches */ -+ -+ OP_POSSTAR, /* 34 Possessified star */ -+ OP_POSPLUS, /* 35 Possessified plus */ -+ OP_POSQUERY, /* 36 Posesssified query */ -+ OP_POSUPTO, /* 37 Possessified upto */ -+ -+ OP_NOTSTAR, /* 38 The maximizing and minimizing versions of */ -+ OP_NOTMINSTAR, /* 39 these six opcodes must come in pairs, with */ -+ OP_NOTPLUS, /* 40 the minimizing one second. They must be in */ -+ OP_NOTMINPLUS, /* 41 exactly the same order as those above. */ -+ OP_NOTQUERY, /* 42 This set applies to "not" single characters. */ -+ OP_NOTMINQUERY, /* 43 */ -+ -+ OP_NOTUPTO, /* 44 From 0 to n matches */ -+ OP_NOTMINUPTO, /* 45 */ -+ OP_NOTEXACT, /* 46 Exactly n matches */ -+ -+ OP_NOTPOSSTAR, /* 47 Possessified versions */ -+ OP_NOTPOSPLUS, /* 48 */ -+ OP_NOTPOSQUERY, /* 49 */ -+ OP_NOTPOSUPTO, /* 50 */ -+ -+ OP_TYPESTAR, /* 51 The maximizing and minimizing versions of */ -+ OP_TYPEMINSTAR, /* 52 these six opcodes must come in pairs, with */ -+ OP_TYPEPLUS, /* 53 the minimizing one second. These codes must */ -+ OP_TYPEMINPLUS, /* 54 be in exactly the same order as those above. */ -+ OP_TYPEQUERY, /* 55 This set applies to character types such as \d */ -+ OP_TYPEMINQUERY, /* 56 */ -+ -+ OP_TYPEUPTO, /* 57 From 0 to n matches */ -+ OP_TYPEMINUPTO, /* 58 */ -+ OP_TYPEEXACT, /* 59 Exactly n matches */ -+ -+ OP_TYPEPOSSTAR, /* 60 Possessified versions */ -+ OP_TYPEPOSPLUS, /* 61 */ -+ OP_TYPEPOSQUERY, /* 62 */ -+ OP_TYPEPOSUPTO, /* 63 */ -+ -+ OP_CRSTAR, /* 64 The maximizing and minimizing versions of */ -+ OP_CRMINSTAR, /* 65 all these opcodes must come in pairs, with */ -+ OP_CRPLUS, /* 66 the minimizing one second. These codes must */ -+ OP_CRMINPLUS, /* 67 be in exactly the same order as those above. */ -+ OP_CRQUERY, /* 68 These are for character classes and back refs */ -+ OP_CRMINQUERY, /* 69 */ -+ OP_CRRANGE, /* 70 These are different to the three sets above. */ -+ OP_CRMINRANGE, /* 71 */ - -- OP_CLASS, /* 59 Match a character class, chars < 256 only */ -- OP_NCLASS, /* 60 Same, but the bitmap was created from a negative -+ OP_CLASS, /* 72 Match a character class, chars < 256 only */ -+ OP_NCLASS, /* 73 Same, but the bitmap was created from a negative - class - the difference is relevant only when a UTF-8 - character > 255 is encountered. */ - -- OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the -+ OP_XCLASS, /* 74 Extended class for handling UTF-8 chars within the - class. This does both positive and negative. */ - -- OP_REF, /* 62 Match a back reference */ -- OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */ -- OP_CALLOUT, /* 64 Call out to external function if provided */ -- -- OP_ALT, /* 65 Start of alternation */ -- OP_KET, /* 66 End of group that doesn't have an unbounded repeat */ -- OP_KETRMAX, /* 67 These two must remain together and in this */ -- OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */ -- -- /* The assertions must come before ONCE and COND */ -- -- OP_ASSERT, /* 69 Positive lookahead */ -- OP_ASSERT_NOT, /* 70 Negative lookahead */ -- OP_ASSERTBACK, /* 71 Positive lookbehind */ -- OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */ -- OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */ -- -- /* ONCE and COND must come after the assertions, with ONCE first, as there's -- a test for >= ONCE for a subpattern that isn't an assertion. */ -- -- OP_ONCE, /* 74 Once matched, don't back up into the subpattern */ -- OP_COND, /* 75 Conditional group */ -- OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */ -- -- OP_BRAZERO, /* 77 These two must remain together and in this */ -- OP_BRAMINZERO, /* 78 order. */ -- -- OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater -- than can fit into an opcode. */ -- -- OP_BRA /* 80 This and greater values are used for brackets that -- extract substrings up to EXTRACT_BASIC_MAX. After -- that, use is made of OP_BRANUMBER. */ --}; -- --/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and --study.c that all opcodes are less than 128 in value. This makes handling UTF-8 --character sequences easier. */ -- --/* The highest extraction number before we have to start using additional --bytes. (Originally PCRE didn't have support for extraction counts highter than --this number.) The value is limited by the number of opcodes left after OP_BRA, --i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional --opcodes. */ -+ OP_REF, /* 75 Match a back reference */ -+ OP_RECURSE, /* 76 Match a numbered subpattern (possibly recursive) */ -+ OP_CALLOUT, /* 77 Call out to external function if provided */ -+ -+ OP_ALT, /* 78 Start of alternation */ -+ OP_KET, /* 79 End of group that doesn't have an unbounded repeat */ -+ OP_KETRMAX, /* 80 These two must remain together and in this */ -+ OP_KETRMIN, /* 81 order. They are for groups the repeat for ever. */ -+ -+ /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ -+ -+ OP_ASSERT, /* 82 Positive lookahead */ -+ OP_ASSERT_NOT, /* 83 Negative lookahead */ -+ OP_ASSERTBACK, /* 84 Positive lookbehind */ -+ OP_ASSERTBACK_NOT, /* 85 Negative lookbehind */ -+ OP_REVERSE, /* 86 Move pointer back - used in lookbehind assertions */ -+ -+ /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, -+ as there's a test for >= ONCE for a subpattern that isn't an assertion. */ -+ -+ OP_ONCE, /* 87 Atomic group */ -+ OP_BRA, /* 88 Start of non-capturing bracket */ -+ OP_CBRA, /* 89 Start of capturing bracket */ -+ OP_COND, /* 90 Conditional group */ -+ -+ /* These three must follow the previous three, in the same order. There's a -+ check for >= SBRA to distinguish the two sets. */ -+ -+ OP_SBRA, /* 91 Start of non-capturing bracket, check empty */ -+ OP_SCBRA, /* 92 Start of capturing bracket, check empty */ -+ OP_SCOND, /* 93 Conditional group, check empty */ -+ -+ OP_CREF, /* 94 Used to hold a capture number as condition */ -+ OP_RREF, /* 95 Used to hold a recursion number as condition */ -+ OP_DEF, /* 96 The DEFINE condition */ - --#define EXTRACT_BASIC_MAX 100 -+ OP_BRAZERO, /* 97 These two must remain together and in this */ -+ OP_BRAMINZERO /* 98 order. */ -+}; - - - /* This macro defines textual names for all the opcodes. These are used only -@@ -648,17 +705,21 @@ - #define OP_NAME_LIST \ - "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \ - "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ -- "notprop", "prop", "extuni", \ -+ "notprop", "prop", "anynl", "extuni", \ - "\\Z", "\\z", \ - "Opt", "^", "$", "char", "charnc", "not", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ -+ "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ -+ "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ -+ "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", \ - "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ - "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ -- "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\ -- "Brazero", "Braminzero", "Branumber", "Bra" -+ "AssertB", "AssertB not", "Reverse", \ -+ "Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", \ -+ "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero" - - - /* This macro defines the length of fixed length operations in the compiled -@@ -674,7 +735,7 @@ - 1, /* End */ \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \ - 1, 1, /* Any, Anybyte */ \ -- 3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \ -+ 3, 3, 1, 1, /* NOTPROP, PROP, EXTUNI, ANYNL */ \ - 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ - 2, /* Char - the minimum length */ \ - 2, /* Charnc - the minimum length */ \ -@@ -682,12 +743,15 @@ - /* Positive single-char repeats ** These are */ \ - 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ - 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \ -+ 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ - /* Negative single-char repeats - only for chars < 256 */ \ - 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ - 4, 4, 4, /* NOT upto, minupto, exact */ \ -+ 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \ - /* Positive type repeats */ \ - 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ - 4, 4, 4, /* Type upto, minupto, exact */ \ -+ 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ - /* Character class & ref repeats */ \ - 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ - 5, 5, /* CRRANGE, CRMINRANGE */ \ -@@ -706,17 +770,22 @@ - 1+LINK_SIZE, /* Assert behind */ \ - 1+LINK_SIZE, /* Assert behind not */ \ - 1+LINK_SIZE, /* Reverse */ \ -- 1+LINK_SIZE, /* Once */ \ -+ 1+LINK_SIZE, /* ONCE */ \ -+ 1+LINK_SIZE, /* BRA */ \ -+ 3+LINK_SIZE, /* CBRA */ \ - 1+LINK_SIZE, /* COND */ \ -+ 1+LINK_SIZE, /* SBRA */ \ -+ 3+LINK_SIZE, /* SCBRA */ \ -+ 1+LINK_SIZE, /* SCOND */ \ - 3, /* CREF */ \ -+ 3, /* RREF */ \ -+ 1, /* DEF */ \ - 1, 1, /* BRAZERO, BRAMINZERO */ \ -- 3, /* BRANUMBER */ \ -- 1+LINK_SIZE /* BRA */ \ - - --/* A magic value for OP_CREF to indicate the "in recursion" condition. */ -+/* A magic value for OP_RREF to indicate the "any recursion" condition. */ - --#define CREF_RECURSE 0xffff -+#define RREF_ANY 0xffff - - /* Error code numbers. They are given names so that they can more easily be - tracked. */ -@@ -726,7 +795,7 @@ - ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, - ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, - ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, -- ERR50, ERR51 }; -+ ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57 }; - - /* The real format of the start of the pcre block; the index of names and the - code vector run on as long as necessary after the end. We store an explicit -@@ -781,17 +850,23 @@ - const uschar *fcc; /* Points to case-flipping table */ - const uschar *cbits; /* Points to character type table */ - const uschar *ctypes; /* Points to table of type maps */ -+ const uschar *start_workspace;/* The start of working space */ - const uschar *start_code; /* The start of the compiled code */ - const uschar *start_pattern; /* The start of the pattern */ -+ const uschar *end_pattern; /* The end of the pattern */ -+ uschar *hwm; /* High watermark of workspace */ - uschar *name_table; /* The name/number table */ - int names_found; /* Number of entries so far */ - int name_entry_size; /* Size of each entry */ -+ int bracount; /* Count of capturing parens */ - int top_backref; /* Maximum back reference */ - unsigned int backref_map; /* Bitmap of low back refs */ -+ int external_options; /* External (initial) options */ - int req_varyopt; /* "After variable item" flag for reqbyte */ - BOOL nopartial; /* Set TRUE if partial won't work */ -- int nllen; /* 1 or 2 for newline string length */ -- uschar nl[4]; /* Newline string */ -+ int nltype; /* Newline type */ -+ int nllen; /* Newline string length */ -+ uschar nl[4]; /* Newline string when fixed length */ - } compile_data; - - /* Structure for maintaining a chain of pointers to the currently incomplete -@@ -824,6 +899,16 @@ - - struct heapframe; - -+/* Structure for building a chain of data for holding the values of the subject -+pointer at the start of each subpattern, so as to detect when an empty string -+has been matched by a subpattern - to break infinite loops. */ -+ -+typedef struct eptrblock { -+ struct eptrblock *epb_prev; -+ USPTR epb_saved_eptr; -+} eptrblock; -+ -+ - /* Structure for passing "static" information around between the functions - doing traditional NFA matching, so that they are thread-safe. */ - -@@ -834,8 +919,9 @@ - int *offset_vector; /* Offset vector */ - int offset_end; /* One past the end */ - int offset_max; /* The maximum usable for return data */ -- int nllen; /* 1 or 2 for newline string length */ -- uschar nl[4]; /* Newline string */ -+ int nltype; /* Newline type */ -+ int nllen; /* Newline string length */ -+ uschar nl[4]; /* Newline string when fixed */ - const uschar *lcc; /* Points to lower casing table */ - const uschar *ctypes; /* Points to table of type maps */ - BOOL offset_overflow; /* Set if too many extractions */ -@@ -854,6 +940,8 @@ - int end_offset_top; /* Highwater mark at end of match */ - int capture_last; /* Most recent capture number */ - int start_offset; /* The start offset value */ -+ eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ -+ int eptrn; /* Next free eptrblock */ - recursion_info *recursive; /* Linked list of recursion data */ - void *callout_data; /* To pass back to callouts */ - struct heapframe *thisframe; /* Used only when compiling for no recursion */ -@@ -869,8 +957,9 @@ - const uschar *tables; /* Character tables */ - int moptions; /* Match options */ - int poptions; /* Pattern options */ -- int nllen; /* 1 or 2 for newline string length */ -- uschar nl[4]; /* Newline string */ -+ int nltype; /* Newline type */ -+ int nllen; /* Newline string length */ -+ uschar nl[4]; /* Newline string when fixed */ - void *callout_data; /* To pass back to callouts */ - } dfa_match_data; - -@@ -941,13 +1030,17 @@ - one of the exported public functions. They have to be "external" in the C - sense, but are not part of the PCRE public API. */ - --extern int _pcre_ord2utf8(int, uschar *); --extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *, -- const pcre_study_data *, pcre_study_data *); --extern int _pcre_ucp_findprop(const unsigned int, int *, int *); --extern int _pcre_ucp_othercase(const int); --extern int _pcre_valid_utf8(const uschar *, int); --extern BOOL _pcre_xclass(int, const uschar *); -+extern BOOL _pcre_is_newline(const uschar *, const uschar *, int *, -+ BOOL); -+extern int _pcre_ord2utf8(int, uschar *); -+extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, -+ const pcre_study_data *, pcre_study_data *); -+extern int _pcre_ucp_findprop(const unsigned int, int *, int *); -+extern unsigned int _pcre_ucp_othercase(const unsigned int); -+extern int _pcre_valid_utf8(const uschar *, int); -+extern BOOL _pcre_was_newline(const uschar *, const uschar *, int *, -+ BOOL); -+extern BOOL _pcre_xclass(int, const uschar *); - - #endif - -diff -ruN ../pcre.orig/pcrelib/pcre_maketables.c ./pcrelib/pcre_maketables.c ---- ../pcre.orig/pcrelib/pcre_maketables.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_maketables.c Fri Feb 9 22:31:20 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -130,7 +130,7 @@ - meta-character, which in this sense is any character that terminates a run - of data characters. */ - -- if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; -+ if (strchr("\\*+?{^.$|()[", i) != 0) x += ctype_meta; - *p++ = x; - } - -diff -ruN ../pcre.orig/pcrelib/pcre_newline.c ./pcrelib/pcre_newline.c ---- ../pcre.orig/pcrelib/pcre_newline.c Thu Jan 1 01:00:00 1970 -+++ ./pcrelib/pcre_newline.c Fri Feb 9 20:48:47 2007 -@@ -0,0 +1,135 @@ -+/************************************************* -+* Perl-Compatible Regular Expressions * -+*************************************************/ -+ -+/* PCRE is a library of functions to support regular expressions whose syntax -+and semantics are as close as possible to those of the Perl 5 language. -+ -+ Written by Philip Hazel -+ Copyright (c) 1997-2006 University of Cambridge -+ -+----------------------------------------------------------------------------- -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ -+ * Redistributions of source code must retain the above copyright notice, -+ this list of conditions and the following disclaimer. -+ -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ -+ * Neither the name of the University of Cambridge nor the names of its -+ contributors may be used to endorse or promote products derived from -+ this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -+POSSIBILITY OF SUCH DAMAGE. -+----------------------------------------------------------------------------- -+*/ -+ -+ -+/* This module contains internal functions for testing newlines when more than -+one kind of newline is to be recognized. When a newline is found, its length is -+returned. In principle, we could implement several newline "types", each -+referring to a different set of newline characters. At present, PCRE supports -+only NLTYPE_FIXED, which gets handled without these functions, and NLTYPE_ALL, -+so for now the type isn't passed into the functions. It can easily be added -+later if required. The full list of Unicode newline characters is taken from -+http://unicode.org/unicode/reports/tr18/. */ -+ -+ -+#include "pcre_internal.h" -+ -+ -+ -+/************************************************* -+* Check for newline at given position * -+*************************************************/ -+ -+/* It is guaranteed that the initial value of ptr is less than the end of the -+string that is being processed. -+ -+Arguments: -+ ptr pointer to possible newline -+ endptr pointer to the end of the string -+ lenptr where to return the length -+ utf8 TRUE if in utf8 mode -+ -+Returns: TRUE or FALSE -+*/ -+ -+BOOL -+_pcre_is_newline(const uschar *ptr, const uschar *endptr, int *lenptr, -+ BOOL utf8) -+{ -+int c; -+if (utf8) { GETCHAR(c, ptr); } else c = *ptr; -+switch(c) -+ { -+ case 0x000a: /* LF */ -+ case 0x000b: /* VT */ -+ case 0x000c: *lenptr = 1; return TRUE; /* FF */ -+ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; -+ return TRUE; /* CR */ -+ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ -+ case 0x2028: /* LS */ -+ case 0x2029: *lenptr = 3; return TRUE; /* PS */ -+ default: return FALSE; -+ } -+} -+ -+ -+ -+/************************************************* -+* Check for newline at previous position * -+*************************************************/ -+ -+/* It is guaranteed that the initial value of ptr is greater than the start of -+the string that is being processed. -+ -+Arguments: -+ ptr pointer to possible newline -+ startptr pointer to the start of the string -+ lenptr where to return the length -+ utf8 TRUE if in utf8 mode -+ -+Returns: TRUE or FALSE -+*/ -+ -+BOOL -+_pcre_was_newline(const uschar *ptr, const uschar *startptr, int *lenptr, -+ BOOL utf8) -+{ -+int c; -+ptr--; -+if (utf8) -+ { -+ BACKCHAR(ptr); -+ GETCHAR(c, ptr); -+ } -+else c = *ptr; -+switch(c) -+ { -+ case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; -+ return TRUE; /* LF */ -+ case 0x000b: /* VT */ -+ case 0x000c: /* FF */ -+ case 0x000d: *lenptr = 1; return TRUE; /* CR */ -+ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ -+ case 0x2028: /* LS */ -+ case 0x2029: *lenptr = 3; return TRUE; /* PS */ -+ default: return FALSE; -+ } -+} -+ -+/* End of pcre_newline.c */ -diff -ruN ../pcre.orig/pcrelib/pcre_printint.src ./pcrelib/pcre_printint.src ---- ../pcre.orig/pcrelib/pcre_printint.src Wed Aug 30 22:00:22 2006 -+++ ./pcrelib/pcre_printint.src Fri Feb 9 22:31:20 2007 -@@ -49,9 +49,19 @@ - compiled regex for debugging purposes. */ - - -+/* Macro that decides whether a character should be output as a literal or in -+hexadecimal. We don't use isprint() because that can vary from system to system -+(even without the use of locales) and we want the output always to be the same, -+for testing purposes. This macro is used in pcretest as well as in this file. */ -+ -+#define PRINTABLE(c) ((c) >= 32 && (c) < 127) -+ -+/* The table of operator names. */ -+ - static const char *OP_names[] = { OP_NAME_LIST }; - - -+ - /************************************************* - * Print single- or multi-byte character * - *************************************************/ -@@ -63,7 +73,7 @@ - - if (!utf8 || (c & 0xc0) != 0xc0) - { -- if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); -+ if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); - return 0; - } - else -@@ -160,16 +170,6 @@ - - fprintf(f, "%3d ", (int)(code - codestart)); - -- if (*code >= OP_BRA) -- { -- if (*code - OP_BRA > EXTRACT_BASIC_MAX) -- fprintf(f, "%3d Bra extra\n", GET(code, 1)); -- else -- fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA); -- code += _pcre_OP_lengths[OP_BRA]; -- continue; -- } -- - switch(*code) - { - case OP_END: -@@ -203,6 +203,14 @@ - fprintf(f, "\n"); - continue; - -+ case OP_CBRA: -+ case OP_SCBRA: -+ fprintf(f, "%3d %s %d", GET(code, 1), OP_names[*code], -+ GET2(code, 1+LINK_SIZE)); -+ break; -+ -+ case OP_BRA: -+ case OP_SBRA: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_ALT: -@@ -213,33 +221,45 @@ - case OP_ASSERTBACK_NOT: - case OP_ONCE: - case OP_COND: -+ case OP_SCOND: - case OP_REVERSE: - fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]); - break; - -- case OP_BRANUMBER: -- printf("%3d %s", GET2(code, 1), OP_names[*code]); -+ case OP_CREF: -+ fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); - break; - -- case OP_CREF: -- if (GET2(code, 1) == CREF_RECURSE) -- fprintf(f, " Cond recurse"); -+ case OP_RREF: -+ c = GET2(code, 1); -+ if (c == RREF_ANY) -+ fprintf(f, " Cond recurse any"); - else -- fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); -+ fprintf(f, " Cond recurse %d", c); -+ break; -+ -+ case OP_DEF: -+ fprintf(f, " Cond def"); - break; - - case OP_STAR: - case OP_MINSTAR: -+ case OP_POSSTAR: - case OP_PLUS: - case OP_MINPLUS: -+ case OP_POSPLUS: - case OP_QUERY: - case OP_MINQUERY: -+ case OP_POSQUERY: - case OP_TYPESTAR: - case OP_TYPEMINSTAR: -+ case OP_TYPEPOSSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: -+ case OP_TYPEPOSPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: -+ case OP_TYPEPOSQUERY: - fprintf(f, " "); - if (*code >= OP_TYPESTAR) - { -@@ -257,17 +277,20 @@ - case OP_EXACT: - case OP_UPTO: - case OP_MINUPTO: -+ case OP_POSUPTO: - fprintf(f, " "); - extra = print_char(f, code+3, utf8); - fprintf(f, "{"); -- if (*code != OP_EXACT) fprintf(f, ","); -+ if (*code != OP_EXACT) fprintf(f, "0,"); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_MINUPTO) fprintf(f, "?"); -+ else if (*code == OP_POSUPTO) fprintf(f, "+"); - break; - - case OP_TYPEEXACT: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: -+ case OP_TYPEPOSUPTO: - fprintf(f, " %s", OP_names[code[3]]); - if (code[3] == OP_PROP || code[3] == OP_NOTPROP) - { -@@ -278,20 +301,26 @@ - if (*code != OP_TYPEEXACT) fprintf(f, "0,"); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_TYPEMINUPTO) fprintf(f, "?"); -+ else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+"); - break; - - case OP_NOT: -- if (isprint(c = code[1])) fprintf(f, " [^%c]", c); -+ c = code[1]; -+ if (PRINTABLE(c)) fprintf(f, " [^%c]", c); - else fprintf(f, " [^\\x%02x]", c); - break; - - case OP_NOTSTAR: - case OP_NOTMINSTAR: -+ case OP_NOTPOSSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: -+ case OP_NOTPOSPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: -- if (isprint(c = code[1])) fprintf(f, " [^%c]", c); -+ case OP_NOTPOSQUERY: -+ c = code[1]; -+ if (PRINTABLE(c)) fprintf(f, " [^%c]", c); - else fprintf(f, " [^\\x%02x]", c); - fprintf(f, "%s", OP_names[*code]); - break; -@@ -299,11 +328,14 @@ - case OP_NOTEXACT: - case OP_NOTUPTO: - case OP_NOTMINUPTO: -- if (isprint(c = code[3])) fprintf(f, " [^%c]{", c); -+ case OP_NOTPOSUPTO: -+ c = code[3]; -+ if (PRINTABLE(c)) fprintf(f, " [^%c]{", c); - else fprintf(f, " [^\\x%02x]{", c); - if (*code != OP_NOTEXACT) fprintf(f, "0,"); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_NOTMINUPTO) fprintf(f, "?"); -+ else if (*code == OP_NOTPOSUPTO) fprintf(f, "+"); - break; - - case OP_RECURSE: -@@ -363,12 +395,14 @@ - for (j = i+1; j < 256; j++) - if ((ccode[j/8] & (1 << (j&7))) == 0) break; - if (i == '-' || i == ']') fprintf(f, "\\"); -- if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i); -+ if (PRINTABLE(i)) fprintf(f, "%c", i); -+ else fprintf(f, "\\x%02x", i); - if (--j > i) - { - if (j != i + 1) fprintf(f, "-"); - if (j == '-' || j == ']') fprintf(f, "\\"); -- if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j); -+ if (PRINTABLE(j)) fprintf(f, "%c", j); -+ else fprintf(f, "\\x%02x", j); - } - i = j; - } -diff -ruN ../pcre.orig/pcrelib/pcre_scanner.cc ./pcrelib/pcre_scanner.cc ---- ../pcre.orig/pcrelib/pcre_scanner.cc Mon Mar 6 22:45:57 2006 -+++ ./pcrelib/pcre_scanner.cc Fri Feb 9 22:31:20 2007 -@@ -43,6 +43,7 @@ - input_(data_), - skip_(NULL), - should_skip_(false), -+ skip_repeat_(false), - save_comments_(false), - comments_(NULL), - comments_offset_(0) { -@@ -53,6 +54,7 @@ - input_(data_), - skip_(NULL), - should_skip_(false), -+ skip_repeat_(false), - save_comments_(false), - comments_(NULL), - comments_offset_(0) { -@@ -63,15 +65,31 @@ - delete comments_; - } - -+void Scanner::SetSkipExpression(const char* re) { -+ delete skip_; -+ if (re != NULL) { -+ skip_ = new RE(re); -+ should_skip_ = true; -+ skip_repeat_ = true; -+ ConsumeSkip(); -+ } else { -+ skip_ = NULL; -+ should_skip_ = false; -+ skip_repeat_ = false; -+ } -+} -+ - void Scanner::Skip(const char* re) { - delete skip_; - if (re != NULL) { - skip_ = new RE(re); - should_skip_ = true; -+ skip_repeat_ = false; - ConsumeSkip(); - } else { - skip_ = NULL; - should_skip_ = false; -+ skip_repeat_ = false; - } - } - -@@ -118,19 +136,22 @@ - - // helper function to consume *skip_ and honour save_comments_ - void Scanner::ConsumeSkip() { -+ const char* start_data = input_.data(); -+ while (skip_->Consume(&input_)) { -+ if (!skip_repeat_) { -+ // Only one skip allowed. -+ break; -+ } -+ } - if (save_comments_) { -- if (NULL == comments_) { -+ if (comments_ == NULL) { - comments_ = new vector<StringPiece>; - } -- const char *start_data = input_.data(); -- skip_->Consume(&input_); - // already pointing one past end, so no need to +1 - int length = input_.data() - start_data; - if (length > 0) { - comments_->push_back(StringPiece(start_data, length)); - } -- } else { -- skip_->Consume(&input_); - } - } - -diff -ruN ../pcre.orig/pcrelib/pcre_scanner.h ./pcrelib/pcre_scanner.h ---- ../pcre.orig/pcrelib/pcre_scanner.h Tue Aug 9 01:59:00 2005 -+++ ./pcrelib/pcre_scanner.h Fri Feb 9 22:31:20 2007 -@@ -36,7 +36,7 @@ - // Scanner scanner(input); - // string var; - // int number; --// scanner.Skip("\\s+"); // Skip any white space we encounter -+// scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter - // while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) { - // ...; - // } -@@ -90,10 +90,16 @@ - // skipped. For example, a programming language scanner would use - // a skip RE that matches white space and comments. - // -- // scanner.Skip("(\\s|//.*|/[*](.|\n)*?[*]/)*"); -+ // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/"); -+ // -+ // Skipping repeats as long as it succeeds. We used to let people do -+ // this by writing "(...)*" in the regular expression, but that added -+ // up to lots of recursive calls within the pcre library, so now we -+ // control repetition explicitly via the function call API. - // - // You can pass NULL for "re" if you do not want any data to be skipped. -- void Skip(const char* re); -+ void Skip(const char* re); // DEPRECATED; does *not* repeat -+ void SetSkipExpression(const char* re); - - // Temporarily pause "skip"ing. This - // Skip("Foo"); code ; DisableSkip(); code; EnableSkip() -@@ -109,12 +115,13 @@ - /***** Special wrappers around SetSkip() for some common idioms *****/ - - // Arranges to skip whitespace, C comments, C++ comments. -- // The overall RE is a repeated disjunction of the following REs: -+ // The overall RE is a disjunction of the following REs: - // \\s whitespace - // //.*\n C++ comment - // /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x) -+ // We get repetition via the semantics of SetSkipExpression, not by using * - void SkipCXXComments() { -- Skip("((\\s|//.*\n|/[*](.|\n)*?[*]/)*)"); -+ SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/"); - } - - void set_save_comments(bool comments) { -@@ -143,6 +150,7 @@ - StringPiece input_; // Unprocessed input - RE* skip_; // If non-NULL, RE for skipping input - bool should_skip_; // If true, use skip_ -+ bool skip_repeat_; // If true, repeat skip_ as long as it works - bool save_comments_; // If true, aggregate the skip expression - - // the skipped comments -diff -ruN ../pcre.orig/pcrelib/pcre_scanner_unittest.cc ./pcrelib/pcre_scanner_unittest.cc ---- ../pcre.orig/pcrelib/pcre_scanner_unittest.cc Mon Mar 6 22:45:57 2006 -+++ ./pcrelib/pcre_scanner_unittest.cc Fri Feb 9 22:31:20 2007 -@@ -33,10 +33,13 @@ - // functionality. - - #include <stdio.h> -+#include <string> - #include <vector> - #include <pcre_stringpiece.h> - #include <pcre_scanner.h> - -+#define FLAGS_unittest_stack_size 49152 -+ - // Dies with a fatal error if the two values are not equal. - #define CHECK_EQ(a, b) do { \ - if ( (a) != (b) ) { \ -@@ -116,8 +119,31 @@ - comments.resize(0); - } - -+static void TestBigComment() { -+ string input; -+ for (int i = 0; i < 1024; ++i) { -+ char buf[1024]; -+ snprintf(buf, sizeof(buf), " # Comment %d\n", i); -+ input += buf; -+ } -+ input += "name = value;\n"; -+ -+ Scanner s(input.c_str()); -+ s.SetSkipExpression("\\s+|#.*\n"); -+ -+ string name; -+ string value; -+ s.Consume("(\\w+) = (\\w+);", &name, &value); -+ CHECK_EQ(name, "name"); -+ CHECK_EQ(value, "value"); -+} -+ -+// TODO: also test scanner and big-comment in a thread with a -+// small stack size -+ - int main(int argc, char** argv) { - TestScanner(); -+ TestBigComment(); - - // Done - printf("OK\n"); -diff -ruN ../pcre.orig/pcrelib/pcre_study.c ./pcrelib/pcre_study.c ---- ../pcre.orig/pcrelib/pcre_study.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_study.c Fri Feb 9 22:31:20 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -45,6 +45,11 @@ - #include "pcre_internal.h" - - -+/* Returns from set_start_bits() */ -+ -+enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; -+ -+ - /************************************************* - * Set a bit and maybe its alternate case * - *************************************************/ -@@ -72,12 +77,16 @@ - - - /************************************************* --* Create bitmap of starting chars * -+* Create bitmap of starting bytes * - *************************************************/ - --/* This function scans a compiled unanchored expression and attempts to build a --bitmap of the set of initial characters. If it can't, it returns FALSE. As time --goes by, we may be able to get more clever at doing this. -+/* This function scans a compiled unanchored expression recursively and -+attempts to build a bitmap of the set of possible starting bytes. As time goes -+by, we may be able to get more clever at doing this. The SSB_CONTINUE return is -+useful for parenthesized groups in patterns such as (a*)b where the group -+provides some optional starting bytes but scanning must continue at the outer -+level to find at least one mandatory byte. At the outermost level, this -+function fails unless the result is SSB_DONE. - - Arguments: - code points to an expression -@@ -86,14 +95,17 @@ - utf8 TRUE if in UTF-8 mode - cd the block with char table pointers - --Returns: TRUE if table built, FALSE otherwise -+Returns: SSB_FAIL => Failed to find any starting bytes -+ SSB_DONE => Found mandatory starting bytes -+ SSB_CONTINUE => Found optional starting bytes - */ - --static BOOL -+static int - set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, - BOOL utf8, compile_data *cd) - { - register int c; -+int yield = SSB_DONE; - - #if 0 - /* ========================================================================= */ -@@ -114,36 +126,60 @@ - - do - { -- const uschar *tcode = code + 1 + LINK_SIZE; -+ const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE; - BOOL try_next = TRUE; - -- while (try_next) -+ while (try_next) /* Loop for items in this branch */ - { -- /* If a branch starts with a bracket or a positive lookahead assertion, -- recurse to set bits from within them. That's all for this branch. */ -- -- if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) -+ int rc; -+ switch(*tcode) - { -- if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) -- return FALSE; -- try_next = FALSE; -- } -+ /* Fail if we reach something we don't understand */ - -- else switch(*tcode) -- { - default: -- return FALSE; -+ return SSB_FAIL; - -- /* Skip over callout */ -+ /* If we hit a bracket or a positive lookahead assertion, recurse to set -+ bits from within the subpattern. If it can't find anything, we have to -+ give up. If it finds some mandatory character(s), we are done for this -+ branch. Otherwise, carry on scanning after the subpattern. */ -+ -+ case OP_BRA: -+ case OP_SBRA: -+ case OP_CBRA: -+ case OP_SCBRA: -+ case OP_ONCE: -+ case OP_ASSERT: -+ rc = set_start_bits(tcode, start_bits, caseless, utf8, cd); -+ if (rc == SSB_FAIL) return SSB_FAIL; -+ if (rc == SSB_DONE) try_next = FALSE; else -+ { -+ do tcode += GET(tcode, 1); while (*tcode == OP_ALT); -+ tcode += 1 + LINK_SIZE; -+ } -+ break; - -- case OP_CALLOUT: -- tcode += 2 + 2*LINK_SIZE; -+ /* If we hit ALT or KET, it means we haven't found anything mandatory in -+ this branch, though we might have found something optional. For ALT, we -+ continue with the next alternative, but we have to arrange that the final -+ result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, -+ return SSB_CONTINUE: if this is the top level, that indicates failure, -+ but after a nested subpattern, it causes scanning to continue. */ -+ -+ case OP_ALT: -+ yield = SSB_CONTINUE; -+ try_next = FALSE; - break; - -- /* Skip over extended extraction bracket number */ -+ case OP_KET: -+ case OP_KETRMAX: -+ case OP_KETRMIN: -+ return SSB_CONTINUE; - -- case OP_BRANUMBER: -- tcode += 3; -+ /* Skip over callout */ -+ -+ case OP_CALLOUT: -+ tcode += 2 + 2*LINK_SIZE; - break; - - /* Skip over lookbehind and negative lookahead assertions */ -@@ -152,7 +188,7 @@ - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - do tcode += GET(tcode, 1); while (*tcode == OP_ALT); -- tcode += 1+LINK_SIZE; -+ tcode += 1 + LINK_SIZE; - break; - - /* Skip over an option setting, changing the caseless flag */ -@@ -166,27 +202,30 @@ - - case OP_BRAZERO: - case OP_BRAMINZERO: -- if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) -- return FALSE; -+ if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL) -+ return SSB_FAIL; - /* ========================================================================= - See the comment at the head of this function concerning the next line, - which was an old fudge for the benefit of OS/2. - dummy = 1; - ========================================================================= */ - do tcode += GET(tcode,1); while (*tcode == OP_ALT); -- tcode += 1+LINK_SIZE; -+ tcode += 1 + LINK_SIZE; - break; - - /* Single-char * or ? sets the bit and tries the next item */ - - case OP_STAR: - case OP_MINSTAR: -+ case OP_POSSTAR: - case OP_QUERY: - case OP_MINQUERY: -+ case OP_POSQUERY: - set_bit(start_bits, tcode[1], caseless, cd); - tcode += 2; - #ifdef SUPPORT_UTF8 -- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; -+ if (utf8 && tcode[-1] >= 0xc0) -+ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; - #endif - break; - -@@ -194,10 +233,12 @@ - - case OP_UPTO: - case OP_MINUPTO: -+ case OP_POSUPTO: - set_bit(start_bits, tcode[3], caseless, cd); - tcode += 4; - #ifdef SUPPORT_UTF8 -- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; -+ if (utf8 && tcode[-1] >= 0xc0) -+ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; - #endif - break; - -@@ -210,6 +251,7 @@ - case OP_CHARNC: - case OP_PLUS: - case OP_MINPLUS: -+ case OP_POSPLUS: - set_bit(start_bits, tcode[1], caseless, cd); - try_next = FALSE; - break; -@@ -283,16 +325,19 @@ - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: -+ case OP_TYPEPOSUPTO: - tcode += 2; /* Fall through */ - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: -+ case OP_TYPEPOSSTAR: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: -+ case OP_TYPEPOSQUERY: - switch(tcode[1]) - { - case OP_ANY: -- return FALSE; -+ return SSB_FAIL; - - case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) -@@ -418,7 +463,7 @@ - code += GET(code, 1); /* Advance to next branch */ - } - while (*code == OP_ALT); --return TRUE; -+return yield; - } - - -@@ -492,8 +537,8 @@ - /* See if we can find a fixed set of initial characters for the pattern. */ - - memset(start_bits, 0, 32 * sizeof(uschar)); --if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, -- (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; -+if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, -+ (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; - - /* Get a pcre_extra block and a pcre_study_data block. The study data is put in - the latter, which is pointed to by the former, which may also get additional -diff -ruN ../pcre.orig/pcrelib/pcre_tables.c ./pcrelib/pcre_tables.c ---- ../pcre.orig/pcrelib/pcre_tables.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_tables.c Fri Feb 9 22:31:20 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -72,9 +72,8 @@ - const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; - const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; - --/* Table of the number of extra characters, indexed by the first character --masked with 0x3f. The highest number for a valid UTF-8 character is in fact --0x3d. */ -+/* Table of the number of extra bytes, indexed by the first byte masked with -+0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ - - const uschar _pcre_utf8_table4[] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -@@ -89,6 +88,7 @@ - { "Any", PT_ANY, 0 }, - { "Arabic", PT_SC, ucp_Arabic }, - { "Armenian", PT_SC, ucp_Armenian }, -+ { "Balinese", PT_SC, ucp_Balinese }, - { "Bengali", PT_SC, ucp_Bengali }, - { "Bopomofo", PT_SC, ucp_Bopomofo }, - { "Braille", PT_SC, ucp_Braille }, -@@ -104,6 +104,7 @@ - { "Common", PT_SC, ucp_Common }, - { "Coptic", PT_SC, ucp_Coptic }, - { "Cs", PT_PC, ucp_Cs }, -+ { "Cuneiform", PT_SC, ucp_Cuneiform }, - { "Cypriot", PT_SC, ucp_Cypriot }, - { "Cyrillic", PT_SC, ucp_Cyrillic }, - { "Deseret", PT_SC, ucp_Deseret }, -@@ -146,6 +147,7 @@ - { "N", PT_GC, ucp_N }, - { "Nd", PT_PC, ucp_Nd }, - { "New_Tai_Lue", PT_SC, ucp_New_Tai_Lue }, -+ { "Nko", PT_SC, ucp_Nko }, - { "Nl", PT_PC, ucp_Nl }, - { "No", PT_PC, ucp_No }, - { "Ogham", PT_SC, ucp_Ogham }, -@@ -158,6 +160,8 @@ - { "Pd", PT_PC, ucp_Pd }, - { "Pe", PT_PC, ucp_Pe }, - { "Pf", PT_PC, ucp_Pf }, -+ { "Phags_Pa", PT_SC, ucp_Phags_Pa }, -+ { "Phoenician", PT_SC, ucp_Phoenician }, - { "Pi", PT_PC, ucp_Pi }, - { "Po", PT_PC, ucp_Po }, - { "Ps", PT_PC, ucp_Ps }, -diff -ruN ../pcre.orig/pcrelib/pcre_ucp_searchfuncs.c ./pcrelib/pcre_ucp_searchfuncs.c ---- ../pcre.orig/pcrelib/pcre_ucp_searchfuncs.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_ucp_searchfuncs.c Fri Feb 9 22:31:20 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -131,11 +131,11 @@ - Arguments: - c the character value - --Returns: the other case or -1 if none -+Returns: the other case or NOTACHAR if none - */ - --int --_pcre_ucp_othercase(const int c) -+unsigned int -+_pcre_ucp_othercase(const unsigned int c) - { - int bot = 0; - int top = sizeof(ucp_table)/sizeof(cnode); -@@ -161,14 +161,14 @@ - } - } - --/* Found an entry in the table. Return -1 for a range entry. Otherwise return --the other case if there is one, else -1. */ -+/* Found an entry in the table. Return NOTACHAR for a range entry. Otherwise -+return the other case if there is one, else NOTACHAR. */ - --if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return -1; -+if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return NOTACHAR; - - offset = ucp_table[mid].f1 & f1_casemask; - if ((offset & f1_caseneg) != 0) offset |= f1_caseneg; --return (offset == 0)? -1 : c + offset; -+return (offset == 0)? NOTACHAR : c + offset; - } - - -diff -ruN ../pcre.orig/pcrelib/pcre_valid_utf8.c ./pcrelib/pcre_valid_utf8.c ---- ../pcre.orig/pcrelib/pcre_valid_utf8.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_valid_utf8.c Fri Feb 9 22:31:20 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -79,7 +79,7 @@ - register int ab; - register int c = *p; - if (c < 128) continue; -- if ((c & 0xc0) != 0xc0) return p - string; -+ if (c < 0xc0) return p - string; - ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ - if (length < ab) return p - string; - length -= ab; -diff -ruN ../pcre.orig/pcrelib/pcre_version.c ./pcrelib/pcre_version.c ---- ../pcre.orig/pcrelib/pcre_version.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcre_version.c Fri Feb 9 22:31:20 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -49,16 +49,38 @@ - * Return version string * - *************************************************/ - -+/* These macros are the standard way of turning unquoted text into C strings. -+They allow macros like PCRE_MAJOR to be defined without quotes, which is -+convenient for user programs that want to test its value. */ -+ - #define STRING(a) # a - #define XSTRING(s) STRING(s) - -+/* A problem turned up with PCRE_PRERELEASE, which is defined empty for -+production releases. Originally, it was used naively in this code: -+ -+ return XSTRING(PCRE_MAJOR) -+ "." XSTRING(PCRE_MINOR) -+ XSTRING(PCRE_PRERELEASE) -+ " " XSTRING(PCRE_DATE); -+ -+However, when PCRE_PRERELEASE is empty, this leads to an attempted expansion of -+STRING(). The C standard states: "If (before argument substitution) any -+argument consists of no preprocessing tokens, the behavior is undefined." It -+turns out the gcc treats this case as a single empty string - which is what we -+really want - but Visual C grumbles about the lack of an argument for the -+macro. Unfortunately, both are within their rights. To cope with both ways of -+handling this, I had resort to some messy hackery that does a test at run time. -+I could find no way of detecting that a macro is defined as an empty string at -+pre-processor time. This hack uses a standard trick for avoiding calling -+the STRING macro with an empty argument when doing the test. */ -+ - PCRE_DATA_SCOPE const char * - pcre_version(void) - { --return XSTRING(PCRE_MAJOR) -- "." XSTRING(PCRE_MINOR) -- XSTRING(PCRE_PRERELEASE) -- " " XSTRING(PCRE_DATE); -+return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)? -+ XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) : -+ XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE); - } - - /* End of pcre_version.c */ -diff -ruN ../pcre.orig/pcrelib/pcrecpp.cc ./pcrelib/pcrecpp.cc ---- ../pcre.orig/pcrelib/pcrecpp.cc Wed Aug 30 22:00:22 2006 -+++ ./pcrelib/pcrecpp.cc Fri Feb 9 22:31:20 2007 -@@ -61,7 +61,7 @@ - // If the user doesn't ask for any options, we just use this one - static RE_Options default_options; - --void RE::Init(const char* pat, const RE_Options* options) { -+void RE::Init(const string& pat, const RE_Options* options) { - pattern_ = pat; - if (options == NULL) { - options_ = default_options; -@@ -78,7 +78,7 @@ - // conservative in that it may treat some "simple" patterns - // as "complex" (e.g., if the vertical bar is in a character - // class or is escaped). But it seems good enough. -- if (strchr(pat, '|') == NULL) { -+ if (strchr(pat.c_str(), '|') == NULL) { - // Simple pattern: we can use position-based checks to perform - // fully anchored matches - re_full_ = re_partial_; -@@ -89,12 +89,18 @@ - } - } - --RE::~RE() { -+void RE::Cleanup() { - if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_); - if (re_partial_ != NULL) (*pcre_free)(re_partial_); - if (error_ != &empty_string) delete error_; - } - -+ -+RE::~RE() { -+ Cleanup(); -+} -+ -+ - pcre* RE::Compile(Anchor anchor) { - // First, convert RE_Options into pcre options - int pcre_options = 0; -@@ -424,6 +430,34 @@ - return Rewrite(out, rewrite, text, vec, matches); - } - -+/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) { -+ string result; -+ -+ // Escape any ascii character not in [A-Za-z_0-9]. -+ // -+ // Note that it's legal to escape a character even if it has no -+ // special meaning in a regular expression -- so this function does -+ // that. (This also makes it identical to the perl function of the -+ // same name; see `perldoc -f quotemeta`.) -+ for (int ii = 0; ii < unquoted.size(); ++ii) { -+ // Note that using 'isalnum' here raises the benchmark time from -+ // 32ns to 58ns: -+ if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && -+ (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && -+ (unquoted[ii] < '0' || unquoted[ii] > '9') && -+ unquoted[ii] != '_' && -+ // If this is the part of a UTF8 or Latin1 character, we need -+ // to copy this byte without escaping. Experimentally this is -+ // what works correctly with the regexp library. -+ !(unquoted[ii] & 128)) { -+ result += '\\'; -+ } -+ result += unquoted[ii]; -+ } -+ -+ return result; -+} -+ - /***** Actual matching and rewriting code *****/ - - int RE::TryMatch(const StringPiece& text, -@@ -809,14 +843,14 @@ - return parse_##name##_radix(str, n, dest, 0); \ - } - --DEFINE_INTEGER_PARSERS(short); --DEFINE_INTEGER_PARSERS(ushort); --DEFINE_INTEGER_PARSERS(int); --DEFINE_INTEGER_PARSERS(uint); --DEFINE_INTEGER_PARSERS(long); --DEFINE_INTEGER_PARSERS(ulong); --DEFINE_INTEGER_PARSERS(longlong); --DEFINE_INTEGER_PARSERS(ulonglong); -+DEFINE_INTEGER_PARSERS(short) /* */ -+DEFINE_INTEGER_PARSERS(ushort) /* */ -+DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */ -+DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */ -+DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */ -+DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */ -+DEFINE_INTEGER_PARSERS(longlong) /* */ -+DEFINE_INTEGER_PARSERS(ulonglong) /* */ - - #undef DEFINE_INTEGER_PARSERS - -diff -ruN ../pcre.orig/pcrelib/pcrecpp.h ./pcrelib/pcrecpp.h ---- ../pcre.orig/pcrelib/pcrecpp.h Mon Mar 6 22:45:57 2006 -+++ ./pcrelib/pcrecpp.h Fri Feb 9 22:31:20 2007 -@@ -112,6 +112,12 @@ - // T (where "bool T::ParseFrom(const char*, int)" exists) - // NULL (the corresponding matched sub-pattern is not copied) - // -+// CAVEAT: An optional sub-pattern that does not exist in the matched -+// string is assigned the empty string. Therefore, the following will -+// return false (because the empty string is not a valid number): -+// int number; -+// pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number); -+// - // ----------------------------------------------------------------------- - // DO_MATCH - // -@@ -488,8 +494,25 @@ - // pass in a string or a "const char*" wherever an "RE" is expected. - RE(const char* pat) { Init(pat, NULL); } - RE(const char *pat, const RE_Options& option) { Init(pat, &option); } -- RE(const string& pat) { Init(pat.c_str(), NULL); } -- RE(const string& pat, const RE_Options& option) { Init(pat.c_str(), &option); } -+ RE(const string& pat) { Init(pat, NULL); } -+ RE(const string& pat, const RE_Options& option) { Init(pat, &option); } -+ -+ // Copy constructor & assignment - note that these are expensive -+ // because they recompile the expression. -+ RE(const RE& re) { Init(re.pattern_, &re.options_); } -+ const RE& operator=(const RE& re) { -+ if (this != &re) { -+ Cleanup(); -+ -+ // This is the code that originally came from Google -+ // Init(re.pattern_.c_str(), &re.options_); -+ -+ // This is the replacement from Ari Pollak -+ Init(re.pattern_, &re.options_); -+ } -+ return *this; -+ } -+ - - ~RE(); - -@@ -589,6 +612,15 @@ - const StringPiece &text, - string *out) const; - -+ // Escapes all potentially meaningful regexp characters in -+ // 'unquoted'. The returned string, used as a regular expression, -+ // will exactly match the original string. For example, -+ // 1.5-2.0? -+ // may become: -+ // 1\.5\-2\.0\? -+ static string QuoteMeta(const StringPiece& unquoted); -+ -+ - /***** Generic matching interface *****/ - - // Type of match (TODO: Should be restructured as part of RE_Options) -@@ -611,7 +643,8 @@ - - private: - -- void Init(const char* pattern, const RE_Options* options); -+ void Init(const string& pattern, const RE_Options* options); -+ void Cleanup(); - - // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with - // pairs of integers for the beginning and end positions of matched -@@ -655,11 +688,6 @@ - pcre* re_full_; // For full matches - pcre* re_partial_; // For partial matches - const string* error_; // Error indicator (or points to empty string) -- -- // Don't allow the default copy or assignment constructors -- -- // they're expensive and too easy to do by accident. -- RE(const RE&); -- void operator=(const RE&); - }; - - } // namespace pcrecpp -diff -ruN ../pcre.orig/pcrelib/pcrecpp_unittest.cc ./pcrelib/pcrecpp_unittest.cc ---- ../pcre.orig/pcrelib/pcrecpp_unittest.cc Wed Aug 30 22:00:22 2006 -+++ ./pcrelib/pcrecpp_unittest.cc Fri Feb 9 22:31:20 2007 -@@ -1,4 +1,6 @@ --// Copyright (c) 2005, Google Inc. -+// -*- coding: utf-8 -*- -+// -+// Copyright (c) 2005 - 2006, Google Inc. - // All rights reserved. - // - // Redistribution and use in source and binary forms, with or without -@@ -445,6 +447,80 @@ - CHECK(re4.FullMatch(text_bad) == false); - } - -+// A meta-quoted string, interpreted as a pattern, should always match -+// the original unquoted string. -+static void TestQuoteMeta(string unquoted, RE_Options options = RE_Options()) { -+ string quoted = RE::QuoteMeta(unquoted); -+ RE re(quoted, options); -+ CHECK(re.FullMatch(unquoted)); -+} -+ -+// A string containing meaningful regexp characters, which is then meta- -+// quoted, should not generally match a string the unquoted string does. -+static void NegativeTestQuoteMeta(string unquoted, string should_not_match, -+ RE_Options options = RE_Options()) { -+ string quoted = RE::QuoteMeta(unquoted); -+ RE re(quoted, options); -+ CHECK(!re.FullMatch(should_not_match)); -+} -+ -+// Tests that quoted meta characters match their original strings, -+// and that a few things that shouldn't match indeed do not. -+static void TestQuotaMetaSimple() { -+ TestQuoteMeta("foo"); -+ TestQuoteMeta("foo.bar"); -+ TestQuoteMeta("foo\\.bar"); -+ TestQuoteMeta("[1-9]"); -+ TestQuoteMeta("1.5-2.0?"); -+ TestQuoteMeta("\\d"); -+ TestQuoteMeta("Who doesn't like ice cream?"); -+ TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); -+ TestQuoteMeta("((?!)xxx).*yyy"); -+ TestQuoteMeta("(["); -+} -+ -+static void TestQuoteMetaSimpleNegative() { -+ NegativeTestQuoteMeta("foo", "bar"); -+ NegativeTestQuoteMeta("...", "bar"); -+ NegativeTestQuoteMeta("\\.", "."); -+ NegativeTestQuoteMeta("\\.", ".."); -+ NegativeTestQuoteMeta("(a)", "a"); -+ NegativeTestQuoteMeta("(a|b)", "a"); -+ NegativeTestQuoteMeta("(a|b)", "(a)"); -+ NegativeTestQuoteMeta("(a|b)", "a|b"); -+ NegativeTestQuoteMeta("[0-9]", "0"); -+ NegativeTestQuoteMeta("[0-9]", "0-9"); -+ NegativeTestQuoteMeta("[0-9]", "[9]"); -+ NegativeTestQuoteMeta("((?!)xxx)", "xxx"); -+} -+ -+static void TestQuoteMetaLatin1() { -+ TestQuoteMeta("3\xb2 = 9"); -+} -+ -+static void TestQuoteMetaUtf8() { -+#ifdef SUPPORT_UTF8 -+ TestQuoteMeta("Pl\xc3\xa1\x63ido Domingo", pcrecpp::UTF8()); -+ TestQuoteMeta("xyz", pcrecpp::UTF8()); // No fancy utf8 -+ TestQuoteMeta("\xc2\xb0", pcrecpp::UTF8()); // 2-byte utf8 (degree symbol) -+ TestQuoteMeta("27\xc2\xb0 degrees", pcrecpp::UTF8()); // As a middle character -+ TestQuoteMeta("\xe2\x80\xb3", pcrecpp::UTF8()); // 3-byte utf8 (double prime) -+ TestQuoteMeta("\xf0\x9d\x85\x9f", pcrecpp::UTF8()); // 4-byte utf8 (music note) -+ TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, but should still work -+ NegativeTestQuoteMeta("27\xc2\xb0", // 2-byte utf (degree symbol) -+ "27\\\xc2\\\xb0", -+ pcrecpp::UTF8()); -+#endif -+} -+ -+static void TestQuoteMetaAll() { -+ printf("Testing QuoteMeta\n"); -+ TestQuotaMetaSimple(); -+ TestQuoteMetaSimpleNegative(); -+ TestQuoteMetaLatin1(); -+ TestQuoteMetaUtf8(); -+} -+ - // - // Options tests contributed by - // Giuseppe Maxia, CTO, Stardata s.r.l. -@@ -667,6 +743,35 @@ - Test_all_options(); - } - -+static void TestConstructors() { -+ printf("Testing constructors\n"); -+ -+ RE_Options options; -+ options.set_dotall(true); -+ const char *str = "HELLO\n" "cruel\n" "world"; -+ -+ RE orig("HELLO.*world", options); -+ CHECK(orig.FullMatch(str)); -+ -+ RE copy1(orig); -+ CHECK(copy1.FullMatch(str)); -+ -+ RE copy2("not a match"); -+ CHECK(!copy2.FullMatch(str)); -+ copy2 = copy1; -+ CHECK(copy2.FullMatch(str)); -+ copy2 = orig; -+ CHECK(copy2.FullMatch(str)); -+ -+ // Make sure when we assign to ourselves, nothing bad happens -+ orig = orig; -+ copy1 = copy1; -+ copy2 = copy2; -+ CHECK(orig.FullMatch(str)); -+ CHECK(copy1.FullMatch(str)); -+ CHECK(copy2.FullMatch(str)); -+} -+ - int main(int argc, char** argv) { - // Treat any flag as --help - if (argc > 1 && argv[1][0] == '-') { -@@ -985,11 +1090,14 @@ - CHECK(RE("h.*o").PartialMatch("hello!")); - CHECK(RE("((((((((((((((((((((x))))))))))))))))))))").PartialMatch("x")); - -+ /***** other tests *****/ -+ - RadixTests(); - TestReplace(); - TestExtract(); - TestConsume(); - TestFindAndConsume(); -+ TestQuoteMetaAll(); - TestMatchNumberPeculiarity(); - - // Check the pattern() accessor -@@ -1108,6 +1216,9 @@ - if (getenv("VERBOSE_TEST") != NULL) - VERBOSE_TEST = true; - TestOptions(); -+ -+ // Test the constructors -+ TestConstructors(); - - // Done - printf("OK\n"); -diff -ruN ../pcre.orig/pcrelib/pcregrep.c ./pcrelib/pcregrep.c ---- ../pcre.orig/pcrelib/pcregrep.c Wed Jan 3 21:08:37 2007 -+++ ./pcrelib/pcregrep.c Tue Feb 27 04:31:14 2007 -@@ -6,7 +6,7 @@ - its pattern matching. On a Unix or Win32 system it can recurse into - directories. - -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -56,7 +56,7 @@ - - typedef int BOOL; - --#define VERSION "4.3 01-Jun-2006" -+#define VERSION "4.4 29-Nov-2006" - #define MAX_PATTERN_COUNT 100 - - #if BUFSIZ > 8192 -@@ -65,7 +65,6 @@ - #define MBUFTHIRD 8192 - #endif - -- - /* Values for the "filenames" variable, which specifies options for file name - output. The order is important; it is assumed that a file name is wanted for - all values greater than FN_DEFAULT. */ -@@ -83,6 +82,10 @@ - #define PO_LINE_MATCH 0x0002 - #define PO_FIXED_STRINGS 0x0004 - -+/* Line ending types */ -+ -+enum { EL_LF, EL_CR, EL_CRLF, EL_ANY }; -+ - - - /************************************************* -@@ -100,8 +103,7 @@ - static const char *jfriedl_postfix = ""; - #endif - --static int endlinebyte = '\n'; /* Last byte of endline sequence */ --static int endlineextra = 0; /* Extra bytes for endline sequence */ -+static int endlinetype; - - static char *colour_string = (char *)"1;31"; - static char *colour_option = NULL; -@@ -142,6 +144,7 @@ - static BOOL only_matching = FALSE; - static BOOL quiet = FALSE; - static BOOL silent = FALSE; -+static BOOL utf8 = FALSE; - - /* Structure for options and list of them */ - -@@ -219,6 +222,16 @@ - static const char *suffix[] = { - "", "\\b", ")$", ")$", "\\E", "\\E\\b", "\\E)$", "\\E)$" }; - -+/* UTF-8 tables - used only when the newline setting is "all". */ -+ -+const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; -+ -+const char utf8_table4[] = { -+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, -+ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; -+ - - - /************************************************* -@@ -471,6 +484,216 @@ - - - /************************************************* -+* Find end of line * -+*************************************************/ -+ -+/* The length of the endline sequence that is found is set via lenptr. This may -+be zero at the very end of the file if there is no line-ending sequence there. -+ -+Arguments: -+ p current position in line -+ endptr end of available data -+ lenptr where to put the length of the eol sequence -+ -+Returns: pointer to the last byte of the line -+*/ -+ -+static char * -+end_of_line(char *p, char *endptr, int *lenptr) -+{ -+switch(endlinetype) -+ { -+ default: /* Just in case */ -+ case EL_LF: -+ while (p < endptr && *p != '\n') p++; -+ if (p < endptr) -+ { -+ *lenptr = 1; -+ return p + 1; -+ } -+ *lenptr = 0; -+ return endptr; -+ -+ case EL_CR: -+ while (p < endptr && *p != '\r') p++; -+ if (p < endptr) -+ { -+ *lenptr = 1; -+ return p + 1; -+ } -+ *lenptr = 0; -+ return endptr; -+ -+ case EL_CRLF: -+ for (;;) -+ { -+ while (p < endptr && *p != '\r') p++; -+ if (++p >= endptr) -+ { -+ *lenptr = 0; -+ return endptr; -+ } -+ if (*p == '\n') -+ { -+ *lenptr = 2; -+ return p + 1; -+ } -+ } -+ break; -+ -+ case EL_ANY: -+ while (p < endptr) -+ { -+ int extra = 0; -+ register int c = *((unsigned char *)p); -+ -+ if (utf8 && c >= 0xc0) -+ { -+ int gcii, gcss; -+ extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */ -+ gcss = 6*extra; -+ c = (c & utf8_table3[extra]) << gcss; -+ for (gcii = 1; gcii <= extra; gcii++) -+ { -+ gcss -= 6; -+ c |= (p[gcii] & 0x3f) << gcss; -+ } -+ } -+ -+ p += 1 + extra; -+ -+ switch (c) -+ { -+ case 0x0a: /* LF */ -+ case 0x0b: /* VT */ -+ case 0x0c: /* FF */ -+ *lenptr = 1; -+ return p; -+ -+ case 0x0d: /* CR */ -+ if (p < endptr && *p == 0x0a) -+ { -+ *lenptr = 2; -+ p++; -+ } -+ else *lenptr = 1; -+ return p; -+ -+ case 0x85: /* NEL */ -+ *lenptr = utf8? 2 : 1; -+ return p; -+ -+ case 0x2028: /* LS */ -+ case 0x2029: /* PS */ -+ *lenptr = 3; -+ return p; -+ -+ default: -+ break; -+ } -+ } /* End of loop for ANY case */ -+ -+ *lenptr = 0; /* Must have hit the end */ -+ return endptr; -+ } /* End of overall switch */ -+} -+ -+ -+ -+/************************************************* -+* Find start of previous line * -+*************************************************/ -+ -+/* This is called when looking back for before lines to print. -+ -+Arguments: -+ p start of the subsequent line -+ startptr start of available data -+ -+Returns: pointer to the start of the previous line -+*/ -+ -+static char * -+previous_line(char *p, char *startptr) -+{ -+switch(endlinetype) -+ { -+ default: /* Just in case */ -+ case EL_LF: -+ p--; -+ while (p > startptr && p[-1] != '\n') p--; -+ return p; -+ -+ case EL_CR: -+ p--; -+ while (p > startptr && p[-1] != '\n') p--; -+ return p; -+ -+ case EL_CRLF: -+ for (;;) -+ { -+ p -= 2; -+ while (p > startptr && p[-1] != '\n') p--; -+ if (p <= startptr + 1 || p[-2] == '\r') return p; -+ } -+ return p; /* But control should never get here */ -+ -+ case EL_ANY: -+ if (*(--p) == '\n' && p > startptr && p[-1] == '\r') p--; -+ if (utf8) while ((*p & 0xc0) == 0x80) p--; -+ -+ while (p > startptr) -+ { -+ register int c; -+ char *pp = p - 1; -+ -+ if (utf8) -+ { -+ int extra = 0; -+ while ((*pp & 0xc0) == 0x80) pp--; -+ c = *((unsigned char *)pp); -+ if (c >= 0xc0) -+ { -+ int gcii, gcss; -+ extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */ -+ gcss = 6*extra; -+ c = (c & utf8_table3[extra]) << gcss; -+ for (gcii = 1; gcii <= extra; gcii++) -+ { -+ gcss -= 6; -+ c |= (pp[gcii] & 0x3f) << gcss; -+ } -+ } -+ } -+ else c = *((unsigned char *)pp); -+ -+ switch (c) -+ { -+ case 0x0a: /* LF */ -+ case 0x0b: /* VT */ -+ case 0x0c: /* FF */ -+ case 0x0d: /* CR */ -+ case 0x85: /* NEL */ -+ case 0x2028: /* LS */ -+ case 0x2029: /* PS */ -+ return p; -+ -+ default: -+ break; -+ } -+ -+ p = pp; /* Back one character */ -+ } /* End of loop for ANY case */ -+ -+ return startptr; /* Hit start of data */ -+ } /* End of overall switch */ -+} -+ -+ -+ -+ -+ -+/************************************************* - * Print the previous "after" lines * - *************************************************/ - -@@ -495,13 +718,13 @@ - int count = 0; - while (lastmatchrestart < endptr && count++ < after_context) - { -+ int ellength; - char *pp = lastmatchrestart; - if (printname != NULL) fprintf(stdout, "%s-", printname); - if (number) fprintf(stdout, "%d-", lastmatchnumber++); -- while (*pp != endlinebyte) pp++; -- fwrite(lastmatchrestart, 1, pp - lastmatchrestart + (1 + endlineextra), -- stdout); -- lastmatchrestart = pp + 1; -+ pp = end_of_line(pp, endptr, &ellength); -+ fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout); -+ lastmatchrestart = pp; - } - hyphenpending = TRUE; - } -@@ -558,7 +781,7 @@ - - while (ptr < endptr) - { -- int i; -+ int i, endlinelength; - int mrc = 0; - BOOL match = FALSE; - char *t = ptr; -@@ -571,11 +794,10 @@ - line. In multiline mode the PCRE_FIRSTLINE option is used for compiling, so - that any match is constrained to be in the first line. */ - -- linelength = 0; -- while (t < endptr && *t++ != endlinebyte) linelength++; -+ t = end_of_line(t, endptr, &endlinelength); -+ linelength = t - ptr - endlinelength; - length = multiline? endptr - ptr : linelength; - -- - /* Extra processing for Jeffrey Friedl's debugging. */ - - #ifdef JFRIEDL_DEBUG -@@ -706,13 +928,13 @@ - - if (after_context > 0 && lastmatchnumber > 0) - { -+ int ellength; - int linecount = 0; - char *p = lastmatchrestart; - - while (p < ptr && linecount < after_context) - { -- while (*p != endlinebyte) p++; -- p++; -+ p = end_of_line(p, ptr, &ellength); - linecount++; - } - -@@ -725,10 +947,9 @@ - char *pp = lastmatchrestart; - if (printname != NULL) fprintf(stdout, "%s-", printname); - if (number) fprintf(stdout, "%d-", lastmatchnumber++); -- while (*pp != endlinebyte) pp++; -- fwrite(lastmatchrestart, 1, pp - lastmatchrestart + -- (1 + endlineextra), stdout); -- lastmatchrestart = pp + 1; -+ pp = end_of_line(pp, endptr, &ellength); -+ fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout); -+ lastmatchrestart = pp; - } - if (lastmatchrestart != ptr) hyphenpending = TRUE; - } -@@ -754,8 +975,7 @@ - linecount < before_context) - { - linecount++; -- p--; -- while (p > buffer && p[-1] != endlinebyte) p--; -+ p = previous_line(p, buffer); - } - - if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted) -@@ -763,12 +983,13 @@ - - while (p < ptr) - { -+ int ellength; - char *pp = p; - if (printname != NULL) fprintf(stdout, "%s-", printname); - if (number) fprintf(stdout, "%d-", linenumber - linecount--); -- while (*pp != endlinebyte) pp++; -- fwrite(p, 1, pp - p + (1 + endlineextra), stdout); -- p = pp + 1; -+ pp = end_of_line(pp, endptr, &ellength); -+ fwrite(p, 1, pp - p, stdout); -+ p = pp; - } - } - -@@ -788,11 +1009,16 @@ - - if (multiline) - { -+ int ellength; - char *endmatch = ptr + offsets[1]; - t = ptr; -- while (t < endmatch) { if (*t++ == endlinebyte) linenumber++; } -- while (endmatch < endptr && *endmatch != endlinebyte) endmatch++; -- linelength = endmatch - ptr; -+ while (t < endmatch) -+ { -+ t = end_of_line(t, endptr, &ellength); -+ if (t <= endmatch) linenumber++; else break; -+ } -+ endmatch = end_of_line(endmatch, endptr, &ellength); -+ linelength = endmatch - ptr - ellength; - } - - /*** NOTE: Use only fwrite() to output the data line, so that binary -@@ -824,9 +1050,7 @@ - fprintf(stdout, "%c[00m", 0x1b); - fwrite(ptr + offsets[1], 1, linelength - offsets[1], stdout); - } -- else fwrite(ptr, 1, linelength, stdout); -- -- fprintf(stdout, "\n"); -+ else fwrite(ptr, 1, linelength + endlinelength, stdout); - } - - /* End of doing what has to be done for a match */ -@@ -836,13 +1060,13 @@ - /* Remember where the last match happened for after_context. We remember - where we are about to restart, and that line's number. */ - -- lastmatchrestart = ptr + linelength + 1; -+ lastmatchrestart = ptr + linelength + endlinelength; - lastmatchnumber = linenumber + 1; - } - - /* Advance to after the newline and increment the line number. */ - -- ptr += linelength + 1; -+ ptr += linelength + endlinelength; - linenumber++; - - /* If we haven't yet reached the end of the file (the buffer is full), and -@@ -964,8 +1188,7 @@ - while ((nextfile = readdirectory(dir)) != NULL) - { - int frc, blen; -- sprintf(buffer, "%.512s%c%.128s", pathname, sep, nextfile); -- blen = strlen(buffer); -+ blen = slprintf(buffer, sizeof(buffer), "%.512s%c%.128s", pathname, sep, nextfile); - - if (exclude_compiled != NULL && - pcre_exec(exclude_compiled, NULL, buffer, blen, 0, 0, NULL, 0) >= 0) -@@ -1057,7 +1280,7 @@ - { - int n; - char s[4]; -- if (op->one_char > 0) sprintf(s, "-%c,", op->one_char); else strcpy(s, " "); -+ if (op->one_char > 0) snprintf(s, sizeof(s), "-%c,", op->one_char); else strcpy(s, " "); - printf(" %s --%s%n", s, op->long_name, &n); - n = 30 - n; - if (n < 1) n = 1; -@@ -1098,7 +1321,7 @@ - case 'q': quiet = TRUE; break; - case 'r': dee_action = dee_RECURSE; break; - case 's': silent = TRUE; break; -- case 'u': options |= PCRE_UTF8; break; -+ case 'u': options |= PCRE_UTF8; utf8 = TRUE; break; - case 'v': invert = TRUE; break; - case 'w': process_options |= PO_WORD_MATCH; break; - case 'x': process_options |= PO_LINE_MATCH; break; -@@ -1131,7 +1354,7 @@ - { - static char buffer[8]; - char *p = buffer; --sprintf(p, "%d", n); -+snprintf(p, sizeof(buffer), "%d", n); - while (*p != 0) p++; - switch (n%10) - { -@@ -1177,7 +1400,7 @@ - return FALSE; - } - --sprintf(buffer, "%s%.*s%s", prefix[process_options], MBUFTHIRD, pattern, -+snprintf(buffer, sizeof(buffer), "%s%.*s%s", prefix[process_options], MBUFTHIRD, pattern, - suffix[process_options]); - pattern_list[pattern_count] = - pcre_compile(buffer, options, &error, &errptr, pcretables); -@@ -1231,14 +1454,16 @@ - { - if ((process_options & PO_FIXED_STRINGS) != 0) - { -+ char *eop = pattern + strlen(pattern); - char buffer[MBUFTHIRD]; - for(;;) - { -- char *p = strchr(pattern, endlinebyte); -- if (p == NULL) -+ int ellength; -+ char *p = end_of_line(pattern, eop, &ellength); -+ if (ellength == 0) - return compile_single_pattern(pattern, options, filename, count); -- sprintf(buffer, "%.*s", p - pattern - endlineextra, pattern); -- pattern = p + 1; -+ snprintf(buffer, sizeof(buffer), "%.*s", p - pattern - ellength, pattern); -+ pattern = p; - if (!compile_single_pattern(buffer, options, filename, count)) - return FALSE; - } -@@ -1267,7 +1492,9 @@ - const char *locale_from = "--locale"; - const char *error; - --/* Set the default line ending value from the default in the PCRE library. */ -+/* Set the default line ending value from the default in the PCRE library; -+"lf", "cr", "crlf", and "any" are supported. Anything else is treated as "lf". -+*/ - - (void)pcre_config(PCRE_CONFIG_NEWLINE, &i); - switch(i) -@@ -1275,6 +1502,7 @@ - default: newline = (char *)"lf"; break; - case '\r': newline = (char *)"cr"; break; - case ('\r' << 8) | '\n': newline = (char *)"crlf"; break; -+ case -1: newline = (char *)"any"; break; - } - - /* Process the options */ -@@ -1350,8 +1578,8 @@ - char buff1[24]; - char buff2[24]; - int baselen = opbra - op->long_name; -- sprintf(buff1, "%.*s", baselen, op->long_name); -- sprintf(buff2, "%s%.*s", buff1, strlen(op->long_name) - baselen - 2, -+ snprintf(buff1, sizeof(buff1), "%.*s", baselen, op->long_name); -+ snprintf(buff2, sizeof(buff2), "%s%.*s", buff1, strlen(op->long_name) - baselen - 2, - opbra + 1); - if (strcmp(arg, buff1) == 0 || strcmp(arg, buff2) == 0) - break; -@@ -1565,16 +1793,22 @@ - if (strcmp(newline, "cr") == 0 || strcmp(newline, "CR") == 0) - { - pcre_options |= PCRE_NEWLINE_CR; -- endlinebyte = '\r'; -+ endlinetype = EL_CR; - } - else if (strcmp(newline, "lf") == 0 || strcmp(newline, "LF") == 0) - { - pcre_options |= PCRE_NEWLINE_LF; -+ endlinetype = EL_LF; - } - else if (strcmp(newline, "crlf") == 0 || strcmp(newline, "CRLF") == 0) - { - pcre_options |= PCRE_NEWLINE_CRLF; -- endlineextra = 1; -+ endlinetype = EL_CRLF; -+ } -+else if (strcmp(newline, "any") == 0 || strcmp(newline, "ANY") == 0) -+ { -+ pcre_options |= PCRE_NEWLINE_ANY; -+ endlinetype = EL_ANY; - } - else - { -@@ -1700,7 +1934,7 @@ - if (error != NULL) - { - char s[16]; -- if (pattern_count == 1) s[0] = 0; else sprintf(s, " number %d", j); -+ if (pattern_count == 1) s[0] = 0; else snprintf(s, sizeof(s), " number %d", j); - fprintf(stderr, "pcregrep: Error while studying regex%s: %s\n", s, error); - return 2; - } -diff -ruN ../pcre.orig/pcrelib/pcreposix.c ./pcrelib/pcreposix.c ---- ../pcre.orig/pcrelib/pcreposix.c Mon Jan 1 10:36:04 2007 -+++ ./pcrelib/pcreposix.c Sat Feb 24 04:30:55 2007 -@@ -6,7 +6,7 @@ - and semantics are as close as possible to those of the Perl 5 language. - - Written by Philip Hazel -- Copyright (c) 1997-2007 University of Cambridge -+ Copyright (c) 1997-2006 University of Cambridge - - ----------------------------------------------------------------------------- - Redistribution and use in source and binary forms, with or without -@@ -78,7 +78,7 @@ - REG_BADPAT, /* unrecognized character after (?< */ - REG_BADPAT, /* lookbehind assertion is not fixed length */ - REG_BADPAT, /* malformed number or name after (?( */ -- REG_BADPAT, /* conditional group containe more than two branches */ -+ REG_BADPAT, /* conditional group contains more than two branches */ - REG_BADPAT, /* assertion expected after (?( */ - REG_BADPAT, /* (?R or (?digits must be followed by ) */ - REG_ECTYPE, /* unknown POSIX class name */ -@@ -93,7 +93,7 @@ - REG_BADPAT, /* closing ) for (?C expected */ - REG_BADPAT, /* recursive call could loop indefinitely */ - REG_BADPAT, /* unrecognized character after (?P */ -- REG_BADPAT, /* syntax error after (?P */ -+ REG_BADPAT, /* syntax error in subpattern name (missing terminator) */ - REG_BADPAT, /* two named subpatterns have the same name */ - REG_BADPAT, /* invalid UTF-8 string */ - REG_BADPAT, /* support for \P, \p, and \X has not been compiled */ -@@ -102,7 +102,13 @@ - REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */ - REG_BADPAT, /* too many named subpatterns (maximum 10,000) */ - REG_BADPAT, /* repeated subpattern is too long */ -- REG_BADPAT /* octal value is greater than \377 (not in UTF-8 mode) */ -+ REG_BADPAT, /* octal value is greater than \377 (not in UTF-8 mode) */ -+ REG_BADPAT, /* internal error: overran compiling workspace */ -+ REG_BADPAT, /* internal error: previously-checked referenced subpattern not found */ -+ REG_BADPAT, /* DEFINE group contains more than one branch */ -+ REG_BADPAT, /* repeating a DEFINE group is not allowed */ -+ REG_INVARG, /* inconsistent NEWLINE options */ -+ REG_BADPAT /* \g is not followed followed by an (optionally braced) non-zero number */ - }; - - /* Table of texts corresponding to POSIX error codes */ -@@ -152,7 +158,7 @@ - if (errbuf_size > 0) - { - if (addlength > 0 && errbuf_size >= length + addlength) -- sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset); -+ snprintf(errbuf, errbuf_size, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset); - else - { - strncpy(errbuf, message, errbuf_size - 1); -diff -ruN ../pcre.orig/pcrelib/pcretest.c ./pcrelib/pcretest.c ---- ../pcre.orig/pcrelib/pcretest.c Wed Aug 30 22:00:22 2006 -+++ ./pcrelib/pcretest.c Fri Feb 9 22:31:20 2007 -@@ -44,10 +44,29 @@ - #include <locale.h> - #include <errno.h> - --#ifndef _WIN32 --#include <sys/resource.h> -+ -+/* A number of things vary for Windows builds. Originally, pcretest opened its -+input and output without "b"; then I was told that "b" was needed in some -+environments, so it was added for release 5.0 to both the input and output. (It -+makes no difference on Unix-like systems.) Later I was told that it is wrong -+for the input on Windows. I've now abstracted the modes into two macros that -+are set here, to make it easier to fiddle with them, and removed "b" from the -+input mode under Windows. */ -+ -+#if defined(_WIN32) || defined(WIN32) -+#include <io.h> /* For _setmode() */ -+#include <fcntl.h> /* For _O_BINARY */ -+#define INPUT_MODE "r" -+#define OUTPUT_MODE "wb" -+ -+#else -+#include <sys/time.h> /* These two includes are needed */ -+#include <sys/resource.h> /* for setrlimit(). */ -+#define INPUT_MODE "rb" -+#define OUTPUT_MODE "wb" - #endif - -+ - #define PCRE_SPY /* For Win32 build, import data, not export */ - - /* We include pcre_internal.h because we need the internal info for displaying -@@ -74,10 +93,18 @@ - - /* We also need the pcre_printint() function for printing out compiled - patterns. This function is in a separate file so that it can be included in --pcre_compile.c when that module is compiled with debugging enabled. */ -+pcre_compile.c when that module is compiled with debugging enabled. -+ -+The definition of the macro PRINTABLE, which determines whether to print an -+output character as-is or as a hex value when showing compiled patterns, is -+contained in this file. We uses it here also, in cases when the locale has not -+been explicitly changed, so as to get consistent output from systems that -+differ in their output from isprint() even in the "C" locale. */ - - #include "pcre_printint.src" - -+#define PRINTHEX(c) (locale_set? isprint(c) : PRINTABLE(c)) -+ - - /* It is possible to compile this test program without including support for - testing the POSIX interface, though this is not available via the standard -@@ -103,6 +130,8 @@ - #endif - #endif - -+/* This is the default loop count for timing. */ -+ - #define LOOPREPEAT 500000 - - /* Static variables */ -@@ -114,6 +143,7 @@ - static int callout_fail_count; - static int callout_fail_id; - static int first_callout; -+static int locale_set = 0; - static int show_malloc; - static int use_utf8; - static size_t gotten_store; -@@ -157,6 +187,7 @@ - for (;;) - { - int rlen = buffer_size - (here - buffer); -+ - if (rlen > 1000) - { - int dlen; -@@ -213,7 +244,7 @@ - - /* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess - around with conditional compilation, just do the job by hand. It is only used --for unpicking the -o argument, so just keep it simple. -+for unpicking arguments, so just keep it simple. - - Arguments: - str string to be converted -@@ -311,6 +342,8 @@ - Returns: number of characters placed in the buffer - */ - -+#if !defined NOUTF8 -+ - static int - ord2utf8(int cvalue, uschar *utf8bytes) - { -@@ -327,6 +360,8 @@ - return i + 1; - } - -+#endif -+ - - - /************************************************* -@@ -353,16 +388,19 @@ - { - length -= rc - 1; - p += rc; -- if (c < 256 && isprint(c)) -+ if (PRINTHEX(c)) - { - if (f != NULL) fprintf(f, "%c", c); - yield++; - } - else - { -- int n; -- if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n); -- yield += n; -+ int n = 4; -+ if (f != NULL) fprintf(f, "\\x{%02x}", c); -+ yield += (n <= 0x000000ff)? 2 : -+ (n <= 0x00000fff)? 3 : -+ (n <= 0x0000ffff)? 4 : -+ (n <= 0x000fffff)? 5 : 6; - } - continue; - } -@@ -371,7 +409,8 @@ - - /* Not UTF-8, or malformed UTF-8 */ - -- if (isprint(c = *(p++))) -+ c = *p++; -+ if (PRINTHEX(c)) - { - if (f != NULL) fprintf(f, "%c", c); - yield++; -@@ -614,7 +653,7 @@ - *************************************************/ - - /* This is used both at compile and run-time to check for <xxx> escapes, where --xxx is LF, CR, or CRLF. Print a message and return 0 if there is no match. -+xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match. - - Arguments: - p points after the leading '<' -@@ -629,6 +668,7 @@ - if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR; - if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF; - if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF; -+if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY; - fprintf(f, "Unknown newline type at: <%s\n", p); - return 0; - } -@@ -636,6 +676,38 @@ - - - /************************************************* -+* Usage function * -+*************************************************/ -+ -+static void -+usage(void) -+{ -+printf("Usage: pcretest [options] [<input> [<output>]]\n"); -+printf(" -b show compiled code (bytecode)\n"); -+printf(" -C show PCRE compile-time options and exit\n"); -+printf(" -d debug: show compiled code and information (-b and -i)\n"); -+#if !defined NODFA -+printf(" -dfa force DFA matching for all subjects\n"); -+#endif -+printf(" -help show usage information\n"); -+printf(" -i show information about compiled patterns\n" -+ " -m output memory used information\n" -+ " -o <n> set size of offsets vector to <n>\n"); -+#if !defined NOPOSIX -+printf(" -p use POSIX interface\n"); -+#endif -+printf(" -q quiet: do not output PCRE version number at start\n"); -+printf(" -S <n> set stack size to <n> megabytes\n"); -+printf(" -s output store (memory) used information\n" -+ " -t time compilation and execution\n"); -+printf(" -t <n> time compilation and execution, repeating <n> times\n"); -+printf(" -tm time execution (matching) only\n"); -+printf(" -tm <n> time execution (matching) only, repeating <n> times\n"); -+} -+ -+ -+ -+/************************************************* - * Main Program * - *************************************************/ - -@@ -650,6 +722,7 @@ - int study_options = 0; - int op = 1; - int timeit = 0; -+int timeitm = 0; - int showinfo = 0; - int showstore = 0; - int quiet = 0; -@@ -681,16 +754,19 @@ - dbuffer = (unsigned char *)malloc(buffer_size); - pbuffer = (unsigned char *)malloc(buffer_size); - --/* The outfile variable is static so that new_malloc can use it. The _setmode() --stuff is some magic that I don't understand, but which apparently does good --things in Windows. It's related to line terminations. */ -- --#if defined(_WIN32) || defined(WIN32) --_setmode( _fileno( stdout ), 0x8000 ); --#endif /* defined(_WIN32) || defined(WIN32) */ -+/* The outfile variable is static so that new_malloc can use it. */ - - outfile = stdout; - -+/* The following _setmode() stuff is some Windows magic that tells its runtime -+library to translate CRLF into a single LF character. At least, that's what -+I've been told: never having used Windows I take this all on trust. Originally -+it set 0x8000, but then I was advised that _O_BINARY was better. */ -+ -+#if defined(_WIN32) || defined(WIN32) -+_setmode( _fileno( stdout ), _O_BINARY ); -+#endif -+ - /* Scan options */ - - while (argc > 1 && argv[op][0] == '-') -@@ -699,8 +775,8 @@ - - if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0) - showstore = 1; -- else if (strcmp(argv[op], "-t") == 0) timeit = 1; - else if (strcmp(argv[op], "-q") == 0) quiet = 1; -+ else if (strcmp(argv[op], "-b") == 0) debug = 1; - else if (strcmp(argv[op], "-i") == 0) showinfo = 1; - else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1; - #if !defined NODFA -@@ -713,11 +789,25 @@ - op++; - argc--; - } -+ else if (strcmp(argv[op], "-t") == 0 || strcmp(argv[op], "-tm") == 0) -+ { -+ int both = argv[op][2] == 0; -+ int temp; -+ if (argc > 2 && (temp = get_value((unsigned char *)argv[op+1], &endptr), -+ *endptr == 0)) -+ { -+ timeitm = temp; -+ op++; -+ argc--; -+ } -+ else timeitm = LOOPREPEAT; -+ if (both) timeit = timeitm; -+ } - else if (strcmp(argv[op], "-S") == 0 && argc > 2 && - ((stack_size = get_value((unsigned char *)argv[op+1], &endptr)), - *endptr == 0)) - { --#ifdef _WIN32 -+#if defined(_WIN32) || defined(WIN32) - printf("PCRE: -S not supported on this OS\n"); - exit(1); - #else -@@ -749,7 +839,8 @@ - printf(" %sUnicode properties support\n", rc? "" : "No "); - (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc); - printf(" Newline sequence is %s\n", (rc == '\r')? "CR" : -- (rc == '\n')? "LF" : "CRLF"); -+ (rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" : -+ (rc == -1)? "ANY" : "???"); - (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc); - printf(" Internal link size = %d\n", rc); - (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc); -@@ -762,24 +853,16 @@ - printf(" Match recursion uses %s\n", rc? "stack" : "heap"); - exit(0); - } -+ else if (strcmp(argv[op], "-help") == 0 || -+ strcmp(argv[op], "--help") == 0) -+ { -+ usage(); -+ goto EXIT; -+ } - else - { - printf("** Unknown or malformed option %s\n", argv[op]); -- printf("Usage: pcretest [options] [<input> [<output>]]\n"); -- printf(" -C show PCRE compile-time options and exit\n"); -- printf(" -d debug: show compiled code; implies -i\n"); --#if !defined NODFA -- printf(" -dfa force DFA matching for all subjects\n"); --#endif -- printf(" -i show information about compiled pattern\n" -- " -m output memory used information\n" -- " -o <n> set size of offsets vector to <n>\n"); --#if !defined NOPOSIX -- printf(" -p use POSIX interface\n"); --#endif -- printf(" -S <n> set stack size to <n> megabytes\n"); -- printf(" -s output store (memory) used information\n" -- " -t time compilation and execution\n"); -+ usage(); - yield = 1; - goto EXIT; - } -@@ -803,7 +886,7 @@ - - if (argc > 1) - { -- infile = fopen(argv[op], "rb"); -+ infile = fopen(argv[op], INPUT_MODE); - if (infile == NULL) - { - printf("** Failed to open %s\n", argv[op]); -@@ -814,7 +897,7 @@ - - if (argc > 2) - { -- outfile = fopen(argv[op+1], "wb"); -+ outfile = fopen(argv[op+1], OUTPUT_MODE); - if (outfile == NULL) - { - printf("** Failed to open %s\n", argv[op+1]); -@@ -859,7 +942,7 @@ - int do_showinfo = showinfo; - int do_showrest = 0; - int do_flip = 0; -- int erroroffset, len, delimiter; -+ int erroroffset, len, delimiter, poffset; - - use_utf8 = 0; - -@@ -969,6 +1052,7 @@ - } - - pp = p; -+ poffset = p - buffer; - - for(;;) - { -@@ -989,6 +1073,11 @@ - if (infile != stdin) fprintf(outfile, "%s", (char *)pp); - } - -+ /* The buffer may have moved while being extended; reset the start of data -+ pointer to the correct relative point in the buffer. */ -+ -+ p = buffer + poffset; -+ - /* If the first character after the delimiter is backslash, make - the pattern end with backslash. This is purely to provide a way - of testing for the error message when a pattern ends with backslash. */ -@@ -1020,6 +1109,7 @@ - - case '+': do_showrest = 1; break; - case 'A': options |= PCRE_ANCHORED; break; -+ case 'B': do_debug = 1; break; - case 'C': options |= PCRE_AUTO_CALLOUT; break; - case 'D': do_debug = do_showinfo = 1; break; - case 'E': options |= PCRE_DOLLAR_ENDONLY; break; -@@ -1042,14 +1132,16 @@ - - case 'L': - ppp = pp; -- /* The '\r' test here is so that it works on Windows */ -- while (*ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++; -+ /* The '\r' test here is so that it works on Windows. */ -+ /* The '0' test is just in case this is an unterminated line. */ -+ while (*ppp != 0 && *ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++; - *ppp = 0; - if (setlocale(LC_CTYPE, (const char *)pp) == NULL) - { - fprintf(outfile, "** Failed to set locale \"%s\"\n", pp); - goto SKIP_DATA; - } -+ locale_set = 1; - tables = pcre_maketables(); - pp = ppp; - break; -@@ -1116,19 +1208,19 @@ - #endif /* !defined NOPOSIX */ - - { -- if (timeit) -+ if (timeit > 0) - { - register int i; - clock_t time_taken; - clock_t start_time = clock(); -- for (i = 0; i < LOOPREPEAT; i++) -+ for (i = 0; i < timeit; i++) - { - re = pcre_compile((char *)p, options, &error, &erroroffset, tables); - if (re != NULL) free(re); - } - time_taken = clock() - start_time; -- fprintf(outfile, "Compile time %.3f milliseconds\n", -- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) / -+ fprintf(outfile, "Compile time %.4f milliseconds\n", -+ (((double)time_taken * 1000.0) / (double)timeit) / - (double)CLOCKS_PER_SEC); - } - -@@ -1180,17 +1272,17 @@ - - if (do_study) - { -- if (timeit) -+ if (timeit > 0) - { - register int i; - clock_t time_taken; - clock_t start_time = clock(); -- for (i = 0; i < LOOPREPEAT; i++) -+ for (i = 0; i < timeit; i++) - extra = pcre_study(re, study_options, &error); - time_taken = clock() - start_time; - if (extra != NULL) free(extra); -- fprintf(outfile, " Study time %.3f milliseconds\n", -- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) / -+ fprintf(outfile, " Study time %.4f milliseconds\n", -+ (((double)time_taken * 1000.0) / (double)timeit) / - (double)CLOCKS_PER_SEC); - } - extra = pcre_study(re, study_options, &error); -@@ -1233,6 +1325,12 @@ - - SHOW_INFO: - -+ if (do_debug) -+ { -+ fprintf(outfile, "------------------------------------------------------------------\n"); -+ pcre_printint(re, outfile); -+ } -+ - if (do_showinfo) - { - unsigned long int get_options, all_options; -@@ -1243,12 +1341,6 @@ - int nameentrysize, namecount; - const uschar *nametable; - -- if (do_debug) -- { -- fprintf(outfile, "------------------------------------------------------------------\n"); -- pcre_printint(re, outfile); -- } -- - new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options); - new_info(re, NULL, PCRE_INFO_SIZE, &size); - new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count); -@@ -1327,7 +1419,7 @@ - ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "", - ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : ""); - -- switch (get_options & PCRE_NEWLINE_CRLF) -+ switch (get_options & PCRE_NEWLINE_BITS) - { - case PCRE_NEWLINE_CR: - fprintf(outfile, "Forced newline sequence: CR\n"); -@@ -1341,6 +1433,10 @@ - fprintf(outfile, "Forced newline sequence: CRLF\n"); - break; - -+ case PCRE_NEWLINE_ANY: -+ fprintf(outfile, "Forced newline sequence: ANY\n"); -+ break; -+ - default: - break; - } -@@ -1358,7 +1454,7 @@ - int ch = first_char & 255; - const char *caseless = ((first_char & REQ_CASELESS) == 0)? - "" : " (caseless)"; -- if (isprint(ch)) -+ if (PRINTHEX(ch)) - fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless); - else - fprintf(outfile, "First char = %d%s\n", ch, caseless); -@@ -1373,7 +1469,7 @@ - int ch = need_char & 255; - const char *caseless = ((need_char & REQ_CASELESS) == 0)? - "" : " (caseless)"; -- if (isprint(ch)) -+ if (PRINTHEX(ch)) - fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless); - else - fprintf(outfile, "Need char = %d%s\n", ch, caseless); -@@ -1409,7 +1505,7 @@ - fprintf(outfile, "\n "); - c = 2; - } -- if (isprint(i) && i != ' ') -+ if (PRINTHEX(i) && i != ' ') - { - fprintf(outfile, "%c ", i); - c += 2; -@@ -1468,6 +1564,7 @@ - strerror(errno)); - } - else fprintf(outfile, "Study data written to %s\n", to_file); -+ - } - } - fclose(f); -@@ -1866,7 +1963,7 @@ - - for (;; gmatched++) /* Loop for /g or /G */ - { -- if (timeit) -+ if (timeitm > 0) - { - register int i; - clock_t time_taken; -@@ -1876,7 +1973,7 @@ - if (all_use_dfa || use_dfa) - { - int workspace[1000]; -- for (i = 0; i < LOOPREPEAT; i++) -+ for (i = 0; i < timeitm; i++) - count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset, - options | g_notempty, use_offsets, use_size_offsets, workspace, - sizeof(workspace)/sizeof(int)); -@@ -1884,13 +1981,13 @@ - else - #endif - -- for (i = 0; i < LOOPREPEAT; i++) -+ for (i = 0; i < timeitm; i++) - count = pcre_exec(re, extra, (char *)bptr, len, - start_offset, options | g_notempty, use_offsets, use_size_offsets); - - time_taken = clock() - start_time; -- fprintf(outfile, "Execute time %.3f milliseconds\n", -- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) / -+ fprintf(outfile, "Execute time %.4f milliseconds\n", -+ (((double)time_taken * 1000.0) / (double)timeitm) / - (double)CLOCKS_PER_SEC); - } - -@@ -1966,7 +2063,28 @@ - - if (count >= 0) - { -- int i; -+ int i, maxcount; -+ -+#if !defined NODFA -+ if (all_use_dfa || use_dfa) maxcount = use_size_offsets/2; else -+#endif -+ maxcount = use_size_offsets/3; -+ -+ /* This is a check against a lunatic return value. */ -+ -+ if (count > maxcount) -+ { -+ fprintf(outfile, -+ "** PCRE error: returned count %d is too big for offset size %d\n", -+ count, use_size_offsets); -+ count = use_size_offsets/3; -+ if (do_g || do_G) -+ { -+ fprintf(outfile, "** /%c loop abandoned\n", do_g? 'g' : 'G'); -+ do_g = do_G = FALSE; /* Break g/G loop */ -+ } -+ } -+ - for (i = 0; i < count * 2; i += 2) - { - if (use_offsets[i] < 0) -@@ -2165,6 +2283,7 @@ - { - new_free((void *)tables); - setlocale(LC_CTYPE, "C"); -+ locale_set = 0; - } - } - -diff -ruN ../pcre.orig/pcrelib/ucp.h ./pcrelib/ucp.h ---- ../pcre.orig/pcrelib/ucp.h Mon Mar 6 22:45:57 2006 -+++ ./pcrelib/ucp.h Fri Feb 9 22:31:20 2007 -@@ -6,7 +6,9 @@ - #define _UCP_H - - /* This file contains definitions of the property values that are returned by --the function _pcre_ucp_findprop(). */ -+the function _pcre_ucp_findprop(). New values that are added for new releases -+of Unicode should always be at the end of each enum, for backwards -+compatibility. */ - - /* These are the general character categories. */ - -@@ -118,7 +120,12 @@ - ucp_Tibetan, - ucp_Tifinagh, - ucp_Ugaritic, -- ucp_Yi -+ ucp_Yi, -+ ucp_Balinese, /* New for Unicode 5.0.0 */ -+ ucp_Cuneiform, /* New for Unicode 5.0.0 */ -+ ucp_Nko, /* New for Unicode 5.0.0 */ -+ ucp_Phags_Pa, /* New for Unicode 5.0.0 */ -+ ucp_Phoenician /* New for Unicode 5.0.0 */ - }; - - #endif -diff -ruN ../pcre.orig/pcrelib/ucpinternal.h ./pcrelib/ucpinternal.h ---- ../pcre.orig/pcrelib/ucpinternal.h Mon Mar 6 22:45:57 2006 -+++ ./pcrelib/ucpinternal.h Fri Feb 9 22:31:20 2007 -@@ -2,6 +2,9 @@ - * Unicode Property Table handler * - *************************************************/ - -+#ifndef _UCPINTERNAL_H -+#define _UCPINTERNAL_H -+ - /* Internal header file defining the layout of the bits in each pair of 32-bit - words that form a data item in the table. */ - -@@ -83,5 +86,7 @@ - (7) Otherwise, set the bottom to one element past the current point and goto - (2). - */ -+ -+#endif /* _UCPINTERNAL_H */ - - /* End of ucpinternal.h */ -diff -ruN ../pcre.orig/pcrelib/ucptable.c ./pcrelib/ucptable.c ---- ../pcre.orig/pcrelib/ucptable.c Mon Mar 6 22:45:57 2006 -+++ ./pcrelib/ucptable.c Fri Feb 9 22:31:20 2007 -@@ -1,5 +1,6 @@ - /* This source module is automatically generated from the Unicode --property table. See ucpinternal.h for a description of the layout. */ -+property table. See ucpinternal.h for a description of the layout. -+This version was made from the Unicode 5.0.0 tables. */ - - static cnode ucp_table[] = { - { 0x09800000, 0x0000001f }, -@@ -298,7 +299,7 @@ - { 0x2100017d, 0x24000001 }, - { 0x2100017e, 0x1400ffff }, - { 0x2100017f, 0x1400fed4 }, -- { 0x21000180, 0x14000000 }, -+ { 0x21000180, 0x140000c3 }, - { 0x21000181, 0x240000d2 }, - { 0x21000182, 0x24000001 }, - { 0x21000183, 0x1400ffff }, -@@ -475,13 +476,27 @@ - { 0x21000232, 0x24000001 }, - { 0x21000233, 0x1400ffff }, - { 0x21800234, 0x14000005 }, -- { 0x2100023a, 0x24000000 }, -+ { 0x2100023a, 0x24002a2b }, - { 0x2100023b, 0x24000001 }, - { 0x2100023c, 0x1400ffff }, - { 0x2100023d, 0x2400ff5d }, -- { 0x2100023e, 0x24000000 }, -+ { 0x2100023e, 0x24002a28 }, - { 0x2180023f, 0x14000001 }, -- { 0x21000241, 0x24000053 }, -+ { 0x21000241, 0x24000001 }, -+ { 0x21000242, 0x1400ffff }, -+ { 0x21000243, 0x2400ff3d }, -+ { 0x21000244, 0x24000045 }, -+ { 0x21000245, 0x24000047 }, -+ { 0x21000246, 0x24000001 }, -+ { 0x21000247, 0x1400ffff }, -+ { 0x21000248, 0x24000001 }, -+ { 0x21000249, 0x1400ffff }, -+ { 0x2100024a, 0x24000001 }, -+ { 0x2100024b, 0x1400ffff }, -+ { 0x2100024c, 0x24000001 }, -+ { 0x2100024d, 0x1400ffff }, -+ { 0x2100024e, 0x24000001 }, -+ { 0x2100024f, 0x1400ffff }, - { 0x21800250, 0x14000002 }, - { 0x21000253, 0x1400ff2e }, - { 0x21000254, 0x1400ff32 }, -@@ -499,25 +514,30 @@ - { 0x21800264, 0x14000003 }, - { 0x21000268, 0x1400ff2f }, - { 0x21000269, 0x1400ff2d }, -- { 0x2180026a, 0x14000004 }, -+ { 0x2100026a, 0x14000000 }, -+ { 0x2100026b, 0x140029f7 }, -+ { 0x2180026c, 0x14000002 }, - { 0x2100026f, 0x1400ff2d }, - { 0x21800270, 0x14000001 }, - { 0x21000272, 0x1400ff2b }, - { 0x21800273, 0x14000001 }, - { 0x21000275, 0x1400ff2a }, -- { 0x21800276, 0x14000009 }, -+ { 0x21800276, 0x14000006 }, -+ { 0x2100027d, 0x140029e7 }, -+ { 0x2180027e, 0x14000001 }, - { 0x21000280, 0x1400ff26 }, - { 0x21800281, 0x14000001 }, - { 0x21000283, 0x1400ff26 }, - { 0x21800284, 0x14000003 }, - { 0x21000288, 0x1400ff26 }, -- { 0x21000289, 0x14000000 }, -+ { 0x21000289, 0x1400ffbb }, - { 0x2100028a, 0x1400ff27 }, - { 0x2100028b, 0x1400ff27 }, -- { 0x2180028c, 0x14000005 }, -+ { 0x2100028c, 0x1400ffb9 }, -+ { 0x2180028d, 0x14000004 }, - { 0x21000292, 0x1400ff25 }, - { 0x21000293, 0x14000000 }, -- { 0x21000294, 0x1400ffad }, -+ { 0x21000294, 0x1c000000 }, - { 0x21800295, 0x1400001a }, - { 0x218002b0, 0x18000011 }, - { 0x098002c2, 0x60000003 }, -@@ -532,6 +552,9 @@ - { 0x1b800346, 0x30000029 }, - { 0x13800374, 0x60000001 }, - { 0x1300037a, 0x18000000 }, -+ { 0x1300037b, 0x14000082 }, -+ { 0x1300037c, 0x14000082 }, -+ { 0x1300037d, 0x14000082 }, - { 0x0900037e, 0x54000000 }, - { 0x13800384, 0x60000001 }, - { 0x13000386, 0x24000026 }, -@@ -647,7 +670,9 @@ - { 0x130003fa, 0x24000001 }, - { 0x130003fb, 0x1400ffff }, - { 0x130003fc, 0x14000000 }, -- { 0x138003fd, 0x24000002 }, -+ { 0x130003fd, 0x2400ff7e }, -+ { 0x130003fe, 0x2400ff7e }, -+ { 0x130003ff, 0x2400ff7e }, - { 0x0c000400, 0x24000050 }, - { 0x0c000401, 0x24000050 }, - { 0x0c000402, 0x24000050 }, -@@ -835,7 +860,7 @@ - { 0x0c0004bd, 0x1400ffff }, - { 0x0c0004be, 0x24000001 }, - { 0x0c0004bf, 0x1400ffff }, -- { 0x0c0004c0, 0x24000000 }, -+ { 0x0c0004c0, 0x2400000f }, - { 0x0c0004c1, 0x24000001 }, - { 0x0c0004c2, 0x1400ffff }, - { 0x0c0004c3, 0x24000001 }, -@@ -850,6 +875,7 @@ - { 0x0c0004cc, 0x1400ffff }, - { 0x0c0004cd, 0x24000001 }, - { 0x0c0004ce, 0x1400ffff }, -+ { 0x0c0004cf, 0x1400fff1 }, - { 0x0c0004d0, 0x24000001 }, - { 0x0c0004d1, 0x1400ffff }, - { 0x0c0004d2, 0x24000001 }, -@@ -892,6 +918,12 @@ - { 0x0c0004f7, 0x1400ffff }, - { 0x0c0004f8, 0x24000001 }, - { 0x0c0004f9, 0x1400ffff }, -+ { 0x0c0004fa, 0x24000001 }, -+ { 0x0c0004fb, 0x1400ffff }, -+ { 0x0c0004fc, 0x24000001 }, -+ { 0x0c0004fd, 0x1400ffff }, -+ { 0x0c0004fe, 0x24000001 }, -+ { 0x0c0004ff, 0x1400ffff }, - { 0x0c000500, 0x24000001 }, - { 0x0c000501, 0x1400ffff }, - { 0x0c000502, 0x24000001 }, -@@ -908,6 +940,10 @@ - { 0x0c00050d, 0x1400ffff }, - { 0x0c00050e, 0x24000001 }, - { 0x0c00050f, 0x1400ffff }, -+ { 0x0c000510, 0x24000001 }, -+ { 0x0c000511, 0x1400ffff }, -+ { 0x0c000512, 0x24000001 }, -+ { 0x0c000513, 0x1400ffff }, - { 0x01000531, 0x24000030 }, - { 0x01000532, 0x24000030 }, - { 0x01000533, 0x24000030 }, -@@ -989,8 +1025,7 @@ - { 0x01000587, 0x14000000 }, - { 0x09000589, 0x54000000 }, - { 0x0100058a, 0x44000000 }, -- { 0x19800591, 0x30000028 }, -- { 0x198005bb, 0x30000002 }, -+ { 0x19800591, 0x3000002c }, - { 0x190005be, 0x54000000 }, - { 0x190005bf, 0x30000000 }, - { 0x190005c0, 0x54000000 }, -@@ -1043,6 +1078,13 @@ - { 0x37800780, 0x1c000025 }, - { 0x378007a6, 0x3000000a }, - { 0x370007b1, 0x1c000000 }, -+ { 0x3f8007c0, 0x34000009 }, -+ { 0x3f8007ca, 0x1c000020 }, -+ { 0x3f8007eb, 0x30000008 }, -+ { 0x3f8007f4, 0x18000001 }, -+ { 0x3f0007f6, 0x68000000 }, -+ { 0x3f8007f7, 0x54000002 }, -+ { 0x3f0007fa, 0x18000000 }, - { 0x0e800901, 0x30000001 }, - { 0x0e000903, 0x28000000 }, - { 0x0e800904, 0x1c000035 }, -@@ -1059,7 +1101,7 @@ - { 0x09800964, 0x54000001 }, - { 0x0e800966, 0x34000009 }, - { 0x09000970, 0x54000000 }, -- { 0x0e00097d, 0x1c000000 }, -+ { 0x0e80097b, 0x1c000004 }, - { 0x02000981, 0x30000000 }, - { 0x02800982, 0x28000001 }, - { 0x02800985, 0x1c000007 }, -@@ -1203,7 +1245,9 @@ - { 0x1c800cd5, 0x28000001 }, - { 0x1c000cde, 0x1c000000 }, - { 0x1c800ce0, 0x1c000001 }, -+ { 0x1c800ce2, 0x30000001 }, - { 0x1c800ce6, 0x34000009 }, -+ { 0x1c800cf1, 0x68000001 }, - { 0x24800d02, 0x28000001 }, - { 0x24800d05, 0x1c000007 }, - { 0x24800d0e, 0x1c000002 }, -@@ -1452,13 +1496,33 @@ - { 0x05801a17, 0x30000001 }, - { 0x05801a19, 0x28000002 }, - { 0x05801a1e, 0x54000001 }, -+ { 0x3d801b00, 0x30000003 }, -+ { 0x3d001b04, 0x28000000 }, -+ { 0x3d801b05, 0x1c00002e }, -+ { 0x3d001b34, 0x30000000 }, -+ { 0x3d001b35, 0x28000000 }, -+ { 0x3d801b36, 0x30000004 }, -+ { 0x3d001b3b, 0x28000000 }, -+ { 0x3d001b3c, 0x30000000 }, -+ { 0x3d801b3d, 0x28000004 }, -+ { 0x3d001b42, 0x30000000 }, -+ { 0x3d801b43, 0x28000001 }, -+ { 0x3d801b45, 0x1c000006 }, -+ { 0x3d801b50, 0x34000009 }, -+ { 0x3d801b5a, 0x54000006 }, -+ { 0x3d801b61, 0x68000009 }, -+ { 0x3d801b6b, 0x30000008 }, -+ { 0x3d801b74, 0x68000008 }, - { 0x21801d00, 0x1400002b }, - { 0x21801d2c, 0x18000035 }, - { 0x21801d62, 0x14000015 }, - { 0x0c001d78, 0x18000000 }, -- { 0x21801d79, 0x14000021 }, -+ { 0x21801d79, 0x14000003 }, -+ { 0x21001d7d, 0x14000ee6 }, -+ { 0x21801d7e, 0x1400001c }, - { 0x21801d9b, 0x18000024 }, -- { 0x1b801dc0, 0x30000003 }, -+ { 0x1b801dc0, 0x3000000a }, -+ { 0x1b801dfe, 0x30000001 }, - { 0x21001e00, 0x24000001 }, - { 0x21001e01, 0x1400ffff }, - { 0x21001e02, 0x24000001 }, -@@ -1967,7 +2031,7 @@ - { 0x1b8020dd, 0x2c000003 }, - { 0x1b0020e1, 0x30000000 }, - { 0x1b8020e2, 0x2c000002 }, -- { 0x1b8020e5, 0x30000006 }, -+ { 0x1b8020e5, 0x3000000a }, - { 0x09802100, 0x68000001 }, - { 0x09002102, 0x24000000 }, - { 0x09802103, 0x68000003 }, -@@ -1995,7 +2059,7 @@ - { 0x0900212e, 0x68000000 }, - { 0x0900212f, 0x14000000 }, - { 0x09802130, 0x24000001 }, -- { 0x09002132, 0x68000000 }, -+ { 0x21002132, 0x2400001c }, - { 0x09002133, 0x24000000 }, - { 0x09002134, 0x14000000 }, - { 0x09802135, 0x1c000003 }, -@@ -2008,7 +2072,8 @@ - { 0x09802146, 0x14000003 }, - { 0x0900214a, 0x68000000 }, - { 0x0900214b, 0x64000000 }, -- { 0x0900214c, 0x68000000 }, -+ { 0x0980214c, 0x68000001 }, -+ { 0x2100214e, 0x1400ffe4 }, - { 0x09802153, 0x3c00000c }, - { 0x09002160, 0x38000010 }, - { 0x09002161, 0x38000010 }, -@@ -2042,7 +2107,9 @@ - { 0x0900217d, 0x3800fff0 }, - { 0x0900217e, 0x3800fff0 }, - { 0x0900217f, 0x3800fff0 }, -- { 0x09802180, 0x38000003 }, -+ { 0x09802180, 0x38000002 }, -+ { 0x09002183, 0x24000001 }, -+ { 0x21002184, 0x1400ffff }, - { 0x09802190, 0x64000004 }, - { 0x09802195, 0x68000004 }, - { 0x0980219a, 0x64000001 }, -@@ -2073,10 +2140,9 @@ - { 0x0900237c, 0x64000000 }, - { 0x0980237d, 0x6800001d }, - { 0x0980239b, 0x64000018 }, -- { 0x090023b4, 0x58000000 }, -- { 0x090023b5, 0x48000000 }, -- { 0x090023b6, 0x54000000 }, -- { 0x098023b7, 0x68000024 }, -+ { 0x098023b4, 0x68000027 }, -+ { 0x098023dc, 0x64000005 }, -+ { 0x098023e2, 0x68000005 }, - { 0x09802400, 0x68000026 }, - { 0x09802440, 0x6800000a }, - { 0x09802460, 0x3c00003b }, -@@ -2143,7 +2209,7 @@ - { 0x09802600, 0x6800006e }, - { 0x0900266f, 0x64000000 }, - { 0x09802670, 0x6800002c }, -- { 0x098026a0, 0x68000011 }, -+ { 0x098026a0, 0x68000012 }, - { 0x09802701, 0x68000003 }, - { 0x09802706, 0x68000003 }, - { 0x0980270c, 0x6800001b }, -@@ -2174,6 +2240,7 @@ - { 0x098027c0, 0x64000004 }, - { 0x090027c5, 0x58000000 }, - { 0x090027c6, 0x48000000 }, -+ { 0x098027c7, 0x64000003 }, - { 0x098027d0, 0x64000015 }, - { 0x090027e6, 0x58000000 }, - { 0x090027e7, 0x48000000 }, -@@ -2215,7 +2282,8 @@ - { 0x090029fc, 0x58000000 }, - { 0x090029fd, 0x48000000 }, - { 0x098029fe, 0x64000101 }, -- { 0x09802b00, 0x68000013 }, -+ { 0x09802b00, 0x6800001a }, -+ { 0x09802b20, 0x68000003 }, - { 0x11002c00, 0x24000030 }, - { 0x11002c01, 0x24000030 }, - { 0x11002c02, 0x24000030 }, -@@ -2310,6 +2378,23 @@ - { 0x11002c5c, 0x1400ffd0 }, - { 0x11002c5d, 0x1400ffd0 }, - { 0x11002c5e, 0x1400ffd0 }, -+ { 0x21002c60, 0x24000001 }, -+ { 0x21002c61, 0x1400ffff }, -+ { 0x21002c62, 0x2400d609 }, -+ { 0x21002c63, 0x2400f11a }, -+ { 0x21002c64, 0x2400d619 }, -+ { 0x21002c65, 0x1400d5d5 }, -+ { 0x21002c66, 0x1400d5d8 }, -+ { 0x21002c67, 0x24000001 }, -+ { 0x21002c68, 0x1400ffff }, -+ { 0x21002c69, 0x24000001 }, -+ { 0x21002c6a, 0x1400ffff }, -+ { 0x21002c6b, 0x24000001 }, -+ { 0x21002c6c, 0x1400ffff }, -+ { 0x21002c74, 0x14000000 }, -+ { 0x21002c75, 0x24000001 }, -+ { 0x21002c76, 0x1400ffff }, -+ { 0x21002c77, 0x14000000 }, - { 0x0a002c80, 0x24000001 }, - { 0x0a002c81, 0x1400ffff }, - { 0x0a002c82, 0x24000001 }, -@@ -2559,6 +2644,8 @@ - { 0x3c80a016, 0x1c000476 }, - { 0x3c80a490, 0x68000036 }, - { 0x0980a700, 0x60000016 }, -+ { 0x0980a717, 0x18000003 }, -+ { 0x0980a720, 0x60000001 }, - { 0x3080a800, 0x1c000001 }, - { 0x3000a802, 0x28000000 }, - { 0x3080a803, 0x1c000002 }, -@@ -2570,6 +2657,8 @@ - { 0x3080a825, 0x30000001 }, - { 0x3000a827, 0x28000000 }, - { 0x3080a828, 0x68000003 }, -+ { 0x4080a840, 0x1c000033 }, -+ { 0x4080a874, 0x54000003 }, - { 0x1780ac00, 0x1c002ba3 }, - { 0x0980d800, 0x1000037f }, - { 0x0980db80, 0x1000007f }, -@@ -2765,13 +2854,15 @@ - { 0x1301018a, 0x3c000000 }, - { 0x29810300, 0x1c00001e }, - { 0x29810320, 0x3c000003 }, -- { 0x12810330, 0x1c000019 }, -+ { 0x12810330, 0x1c000010 }, -+ { 0x12010341, 0x38000000 }, -+ { 0x12810342, 0x1c000007 }, - { 0x1201034a, 0x38000000 }, - { 0x3b810380, 0x1c00001d }, - { 0x3b01039f, 0x54000000 }, - { 0x2a8103a0, 0x1c000023 }, - { 0x2a8103c8, 0x1c000007 }, -- { 0x2a0103d0, 0x68000000 }, -+ { 0x2a0103d0, 0x54000000 }, - { 0x2a8103d1, 0x38000004 }, - { 0x0d010400, 0x24000028 }, - { 0x0d010401, 0x24000028 }, -@@ -2861,6 +2952,9 @@ - { 0x0b810837, 0x1c000001 }, - { 0x0b01083c, 0x1c000000 }, - { 0x0b01083f, 0x1c000000 }, -+ { 0x41810900, 0x1c000015 }, -+ { 0x41810916, 0x3c000003 }, -+ { 0x4101091f, 0x54000000 }, - { 0x1e010a00, 0x1c000000 }, - { 0x1e810a01, 0x30000002 }, - { 0x1e810a05, 0x30000001 }, -@@ -2872,6 +2966,9 @@ - { 0x1e010a3f, 0x30000000 }, - { 0x1e810a40, 0x3c000007 }, - { 0x1e810a50, 0x54000008 }, -+ { 0x3e812000, 0x1c00036e }, -+ { 0x3e812400, 0x38000062 }, -+ { 0x3e812470, 0x54000003 }, - { 0x0981d000, 0x680000f5 }, - { 0x0981d100, 0x68000026 }, - { 0x0981d12a, 0x6800003a }, -@@ -2890,6 +2987,7 @@ - { 0x1381d242, 0x30000002 }, - { 0x1301d245, 0x68000000 }, - { 0x0981d300, 0x68000056 }, -+ { 0x0981d360, 0x3c000011 }, - { 0x0981d400, 0x24000019 }, - { 0x0981d41a, 0x14000019 }, - { 0x0981d434, 0x24000019 }, -@@ -2957,6 +3055,8 @@ - { 0x0981d7aa, 0x14000018 }, - { 0x0901d7c3, 0x64000000 }, - { 0x0981d7c4, 0x14000005 }, -+ { 0x0901d7ca, 0x24000000 }, -+ { 0x0901d7cb, 0x14000000 }, - { 0x0981d7ce, 0x34000031 }, - { 0x16820000, 0x1c00a6d6 }, - { 0x1682f800, 0x1c00021d }, |