From 6247f7406311e6b59676be460a1e9394a5a1fe8f Mon Sep 17 00:00:00 2001
From: ru <ru@FreeBSD.org>
Date: Wed, 30 Jul 2003 06:47:03 +0000
Subject: Vendor import of bwk's 29-Jul-2003 release.

---
 contrib/one-true-awk/FIXES  | 46 +++++++++++++++++++++++++++++++++++++++++++++
 contrib/one-true-awk/b.c    | 33 ++++++++------------------------
 contrib/one-true-awk/lex.c  |  2 ++
 contrib/one-true-awk/main.c |  7 +++----
 contrib/one-true-awk/run.c  |  2 +-
 5 files changed, 60 insertions(+), 30 deletions(-)

(limited to 'contrib/one-true-awk')

diff --git a/contrib/one-true-awk/FIXES b/contrib/one-true-awk/FIXES
index bf9381b..296a2c9 100644
--- a/contrib/one-true-awk/FIXES
+++ b/contrib/one-true-awk/FIXES
@@ -25,6 +25,52 @@ THIS SOFTWARE.
 This file lists all bug fixes, changes, etc., made since the AWK book
 was sent to the printers in August, 1987.
 
+Jul 29, 2003:
+	fixed (i think) the long-standing botch that included the beginning of
+	line state ^ for RE's in the set of valid characters; this led to a
+	variety of odd problems, including failure to properly match certain
+	regular expressions in non-US locales.  thanks to ruslan for keeping
+	at this one.
+
+Jul 28, 2003:
+	n-th try at getting internationalization right, with thanks to volker
+	kiefel, arnold robbins and ruslan ermilov for advice, though they
+	should not be blamed for the outcome.  according to posix, "."  is the
+	radix character in programs and command line arguments regardless of
+	the locale; otherwise, the locale should prevail for input and output
+	of numbers.  so it's intended to work that way.
+	
+	i have rescinded the attempt to use strcoll in expanding shorthands in
+	regular expressions (cclenter).  its properties are much too
+	surprising; for example [a-c] matches aAbBc in locale en_US but abBcC
+	in locale fr_CA.  i can see how this might arise by implementation
+	but i cannot explain it to a human user.  (this behavior can be seen
+	in gawk as well; we're leaning on the same library.)
+
+	the issue appears to be that strcoll is meant for sorting, where
+	merging upper and lower case may make sense (though note that unix
+	sort does not do this by default either).  it is not appropriate
+	for regular expressions, where the goal is to match specific
+	patterns of characters.  in any case, the notations [:lower:], etc.,
+	are available in awk, and they are more likely to work correctly in
+	most locales.
+
+	a moratorium is hereby declared on internationalization changes.
+	i apologize to friends and colleagues in other parts of the world.
+	i would truly like to get this "right", but i don't know what
+	that is, and i do not want to keep making changes until it's clear.
+
+Jul 4, 2003:
+	fixed bug that permitted non-terminated RE, as in "awk /x".
+
+Jun 1, 2003:
+	subtle change to split: if source is empty, number of elems
+	is always 0 and the array is not set.
+
+Mar 21, 2003:
+	added some parens to isblank, in another attempt to make things
+	internationally portable.
+
 Mar 14, 2003:
 	the internationalization changes, somewhat modified, are now
 	reinstated.  in theory awk will now do character comparisons
diff --git a/contrib/one-true-awk/b.c b/contrib/one-true-awk/b.c
index df3aaa9..0f949be 100644
--- a/contrib/one-true-awk/b.c
+++ b/contrib/one-true-awk/b.c
@@ -33,7 +33,7 @@ THIS SOFTWARE.
 #include "awk.h"
 #include "ytab.h"
 
-#define	HAT	(NCHARS-2)	/* matches ^ in regular expr */
+#define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
 				/* NCHARS is 2**n */
 #define MAXLIN 22
 
@@ -282,24 +282,9 @@ int quoted(char **pp)	/* pick up next thing after a \\ */
 	return c;
 }
 
-static int collate_range_cmp(int a, int b)
-{
-	int r;
-	static char s[2][2];
-
-	if ((uschar)a == (uschar)b)
-		return 0;
-	s[0][0] = a;
-	s[1][0] = b;
-	if ((r = strcoll(s[0], s[1])) == 0)
-		r = (uschar)a - (uschar)b;
-	return r;
-}
-
 char *cclenter(const char *argp)	/* add a character class */
 {
 	int i, c, c2;
-	int j;
 	uschar *p = (uschar *) argp;
 	uschar *op, *bp;
 	static uschar *buf = 0;
@@ -318,18 +303,15 @@ char *cclenter(const char *argp)	/* add a character class */
 				c2 = *p++;
 				if (c2 == '\\')
 					c2 = quoted((char **) &p);
-				if (collate_range_cmp(c, c2) > 0) {	/* empty; ignore */
+				if (c > c2) {	/* empty; ignore */
 					bp--;
 					i--;
 					continue;
 				}
-				for (j = 0; j < NCHARS; j++) {
-					if ((collate_range_cmp(c, j) > 0) ||
-					    collate_range_cmp(j, c2) > 0)
-						continue;
+				while (c < c2) {
 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
 						FATAL("out of space for character class [%.10s...] 2", p);
-					*bp++ = j;
+					*bp++ = ++c;
 					i++;
 				}
 				continue;
@@ -718,11 +700,14 @@ Node *unary(Node *np)
  * system i use, it's defined here.  if some other locale has a richer
  * definition of "blank", define HAS_ISBLANK and provide your own
  * version.
+ * the parentheses here are an attempt to find a path through the maze
+ * of macro definition and/or function and/or version provided.  thanks
+ * to nelson beebe for the suggestion; let's see if it works everywhere.
  */
 
 #ifndef HAS_ISBLANK
 
-int isblank(int c)
+int (isblank)(int c)
 {
 	return c==' ' || c=='\t';
 }
@@ -839,8 +824,6 @@ int cgoto(fa *f, int s, int c)
 	int i, j, k;
 	int *p, *q;
 
-	if (c < 0 || c > 255)
-		FATAL("can't happen: neg char %d in cgoto", c);
 	while (f->accept >= maxsetvec) {	/* guessing here! */
 		maxsetvec *= 4;
 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
diff --git a/contrib/one-true-awk/lex.c b/contrib/one-true-awk/lex.c
index e4b1fd3..39f5d4d 100644
--- a/contrib/one-true-awk/lex.c
+++ b/contrib/one-true-awk/lex.c
@@ -529,6 +529,8 @@ int regexpr(void)
 		}
 	}
 	*bp = 0;
+	if (c == 0)
+		SYNTAX("non-terminated regular expression %.10s...", buf);
 	yylval.s = tostring(buf);
 	unput('/');
 	RET(REGEXPR);
diff --git a/contrib/one-true-awk/main.c b/contrib/one-true-awk/main.c
index df855dd..6e6604e 100644
--- a/contrib/one-true-awk/main.c
+++ b/contrib/one-true-awk/main.c
@@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 THIS SOFTWARE.
 ****************************************************************/
 
-const char	*version = "version 20030314";
+const char	*version = "version 20030729";
 
 #define DEBUG
 #include <stdio.h>
@@ -55,10 +55,8 @@ int main(int argc, char *argv[])
 {
 	const char *fs = NULL;
 
-	setlocale(LC_ALL, "");
-	setlocale(LC_COLLATE, "");
 	setlocale(LC_CTYPE, "");
-	setlocale(LC_MESSAGES, "");
+	setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */
 	cmdname = argv[0];
 	if (argc == 1) {
 		fprintf(stderr, "Usage: %s [-f programfile | 'program'] [-Ffieldsep] [-v var=value] [files]\n", cmdname);
@@ -147,6 +145,7 @@ int main(int argc, char *argv[])
 	if (!safe)
 		envinit(environ);
 	yyparse();
+	setlocale(LC_NUMERIC, ""); /* back to whatever it is locally */
 	if (fs)
 		*FS = qstring(fs, '\0');
 	   dprintf( ("errorflag=%d\n", errorflag) );
diff --git a/contrib/one-true-awk/run.c b/contrib/one-true-awk/run.c
index 617ac7d..066cb01 100644
--- a/contrib/one-true-awk/run.c
+++ b/contrib/one-true-awk/run.c
@@ -1221,7 +1221,7 @@ Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
 	ap->sval = (char *) makesymtab(NSYMTAB);
 
 	n = 0;
-	if ((*s != '\0' && strlen(fs) > 1) || arg3type == REGEXPR) {	/* reg expr */
+	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
 		fa *pfa;
 		if (arg3type == REGEXPR) {	/* it's ready already */
 			pfa = (fa *) a[2];
-- 
cgit v1.1