From 6247f7406311e6b59676be460a1e9394a5a1fe8f Mon Sep 17 00:00:00 2001 From: ru Date: Wed, 30 Jul 2003 06:47:03 +0000 Subject: Vendor import of bwk's 29-Jul-2003 release. --- contrib/one-true-awk/FIXES | 46 +++++++++++++++++++++++++++++++++++++++++++++ contrib/one-true-awk/b.c | 33 ++++++++------------------------ contrib/one-true-awk/lex.c | 2 ++ contrib/one-true-awk/main.c | 7 +++---- contrib/one-true-awk/run.c | 2 +- 5 files changed, 60 insertions(+), 30 deletions(-) (limited to 'contrib/one-true-awk') diff --git a/contrib/one-true-awk/FIXES b/contrib/one-true-awk/FIXES index bf9381b..296a2c9 100644 --- a/contrib/one-true-awk/FIXES +++ b/contrib/one-true-awk/FIXES @@ -25,6 +25,52 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the AWK book was sent to the printers in August, 1987. +Jul 29, 2003: + fixed (i think) the long-standing botch that included the beginning of + line state ^ for RE's in the set of valid characters; this led to a + variety of odd problems, including failure to properly match certain + regular expressions in non-US locales. thanks to ruslan for keeping + at this one. + +Jul 28, 2003: + n-th try at getting internationalization right, with thanks to volker + kiefel, arnold robbins and ruslan ermilov for advice, though they + should not be blamed for the outcome. according to posix, "." is the + radix character in programs and command line arguments regardless of + the locale; otherwise, the locale should prevail for input and output + of numbers. so it's intended to work that way. + + i have rescinded the attempt to use strcoll in expanding shorthands in + regular expressions (cclenter). its properties are much too + surprising; for example [a-c] matches aAbBc in locale en_US but abBcC + in locale fr_CA. i can see how this might arise by implementation + but i cannot explain it to a human user. (this behavior can be seen + in gawk as well; we're leaning on the same library.) 
+ + the issue appears to be that strcoll is meant for sorting, where + merging upper and lower case may make sense (though note that unix + sort does not do this by default either). it is not appropriate + for regular expressions, where the goal is to match specific + patterns of characters. in any case, the notations [:lower:], etc., + are available in awk, and they are more likely to work correctly in + most locales. + + a moratorium is hereby declared on internationalization changes. + i apologize to friends and colleagues in other parts of the world. + i would truly like to get this "right", but i don't know what + that is, and i do not want to keep making changes until it's clear. + +Jul 4, 2003: + fixed bug that permitted non-terminated RE, as in "awk /x". + +Jun 1, 2003: + subtle change to split: if source is empty, number of elems + is always 0 and the array is not set. + +Mar 21, 2003: + added some parens to isblank, in another attempt to make things + internationally portable. + Mar 14, 2003: the internationalization changes, somewhat modified, are now reinstated. in theory awk will now do character comparisons diff --git a/contrib/one-true-awk/b.c b/contrib/one-true-awk/b.c index df3aaa9..0f949be 100644 --- a/contrib/one-true-awk/b.c +++ b/contrib/one-true-awk/b.c @@ -33,7 +33,7 @@ THIS SOFTWARE. 
#include "awk.h" #include "ytab.h" -#define HAT (NCHARS-2) /* matches ^ in regular expr */ +#define HAT (NCHARS+2) /* matches ^ in regular expr */ /* NCHARS is 2**n */ #define MAXLIN 22 @@ -282,24 +282,9 @@ int quoted(char **pp) /* pick up next thing after a \\ */ return c; } -static int collate_range_cmp(int a, int b) -{ - int r; - static char s[2][2]; - - if ((uschar)a == (uschar)b) - return 0; - s[0][0] = a; - s[1][0] = b; - if ((r = strcoll(s[0], s[1])) == 0) - r = (uschar)a - (uschar)b; - return r; -} - char *cclenter(const char *argp) /* add a character class */ { int i, c, c2; - int j; uschar *p = (uschar *) argp; uschar *op, *bp; static uschar *buf = 0; @@ -318,18 +303,15 @@ char *cclenter(const char *argp) /* add a character class */ c2 = *p++; if (c2 == '\\') c2 = quoted((char **) &p); - if (collate_range_cmp(c, c2) > 0) { /* empty; ignore */ + if (c > c2) { /* empty; ignore */ bp--; i--; continue; } - for (j = 0; j < NCHARS; j++) { - if ((collate_range_cmp(c, j) > 0) || - collate_range_cmp(j, c2) > 0) - continue; + while (c < c2) { if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0)) FATAL("out of space for character class [%.10s...] 2", p); - *bp++ = j; + *bp++ = ++c; i++; } continue; @@ -718,11 +700,14 @@ Node *unary(Node *np) * system i use, it's defined here. if some other locale has a richer * definition of "blank", define HAS_ISBLANK and provide your own * version. + * the parentheses here are an attempt to find a path through the maze + * of macro definition and/or function and/or version provided. thanks + * to nelson beebe for the suggestion; let's see if it works everywhere. */ #ifndef HAS_ISBLANK -int isblank(int c) +int (isblank)(int c) { return c==' ' || c=='\t'; } @@ -839,8 +824,6 @@ int cgoto(fa *f, int s, int c) int i, j, k; int *p, *q; - if (c < 0 || c > 255) - FATAL("can't happen: neg char %d in cgoto", c); while (f->accept >= maxsetvec) { /* guessing here! 
*/ maxsetvec *= 4; setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); diff --git a/contrib/one-true-awk/lex.c b/contrib/one-true-awk/lex.c index e4b1fd3..39f5d4d 100644 --- a/contrib/one-true-awk/lex.c +++ b/contrib/one-true-awk/lex.c @@ -529,6 +529,8 @@ int regexpr(void) } } *bp = 0; + if (c == 0) + SYNTAX("non-terminated regular expression %.10s...", buf); yylval.s = tostring(buf); unput('/'); RET(REGEXPR); diff --git a/contrib/one-true-awk/main.c b/contrib/one-true-awk/main.c index df855dd..6e6604e 100644 --- a/contrib/one-true-awk/main.c +++ b/contrib/one-true-awk/main.c @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ -const char *version = "version 20030314"; +const char *version = "version 20030729"; #define DEBUG #include <stdio.h> @@ -55,10 +55,8 @@ int main(int argc, char *argv[]) { const char *fs = NULL; - setlocale(LC_ALL, ""); - setlocale(LC_COLLATE, ""); setlocale(LC_CTYPE, ""); - setlocale(LC_MESSAGES, ""); + setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */ cmdname = argv[0]; if (argc == 1) { fprintf(stderr, "Usage: %s [-f programfile | 'program'] [-Ffieldsep] [-v var=value] [files]\n", cmdname); @@ -147,6 +145,7 @@ int main(int argc, char *argv[]) if (!safe) envinit(environ); yyparse(); + setlocale(LC_NUMERIC, ""); /* back to whatever it is locally */ if (fs) *FS = qstring(fs, '\0'); dprintf( ("errorflag=%d\n", errorflag) ); diff --git a/contrib/one-true-awk/run.c b/contrib/one-true-awk/run.c index 617ac7d..066cb01 100644 --- a/contrib/one-true-awk/run.c +++ b/contrib/one-true-awk/run.c @@ -1221,7 +1221,7 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */ ap->sval = (char *) makesymtab(NSYMTAB); n = 0; - if ((*s != '\0' && strlen(fs) > 1) || arg3type == REGEXPR) { /* reg expr */ + if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) { /* reg expr */ fa *pfa; if (arg3type == REGEXPR) { /* it's 
ready already */ pfa = (fa *) a[2]; -- cgit v1.1