From db66ea27a00160961db238cc97c52107e5bc0e83 Mon Sep 17 00:00:00 2001
From: tjr <tjr@FreeBSD.org>
Date: Sun, 11 Jul 2004 05:58:31 +0000
Subject: Remove incomplete support for multi-character collating elements.
 Remove unused character category calculations.

---
 lib/libc/regex/regcomp.c | 253 ++---------------------------------------------
 lib/libc/regex/regex2.h  |  17 ----
 2 files changed, 8 insertions(+), 262 deletions(-)

(limited to 'lib/libc/regex')

diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index e342fda..95a2874 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -106,17 +106,6 @@ static void freeset(struct parse *p, cset *cs);
 static int freezeset(struct parse *p, cset *cs);
 static int firstch(struct parse *p, cset *cs);
 static int nch(struct parse *p, cset *cs);
-static void mcadd(struct parse *p, cset *cs, char *cp) __unused;
-#if used
-static void mcsub(cset *cs, char *cp);
-static int mcin(cset *cs, char *cp);
-static char *mcfind(cset *cs, char *cp);
-#endif
-static void mcinvert(struct parse *p, cset *cs);
-static void mccase(struct parse *p, cset *cs);
-static int isinsets(struct re_guts *g, int c);
-static int samesets(struct re_guts *g, int c1, int c2);
-static void categorize(struct parse *p, struct re_guts *g);
 static sopno dupl(struct parse *p, sopno start, sopno finish);
 static void doemit(struct parse *p, sop op, size_t opnd);
 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
@@ -124,7 +113,7 @@ static void dofwd(struct parse *p, sopno pos, sop value);
 static void enlarge(struct parse *p, sopno size);
 static void stripsnug(struct parse *p, struct re_guts *g);
 static void findmust(struct parse *p, struct re_guts *g);
-static int altoffset(sop *scan, int offset, int mccs);
+static int altoffset(sop *scan, int offset);
 static void computejumps(struct parse *p, struct re_guts *g);
 static void computematchjumps(struct parse *p, struct re_guts *g);
 static sopno pluscount(struct parse *p, struct re_guts *g);
@@ -216,8 +205,7 @@ int cflags;
 		len = strlen((char *)pattern);
 
 	/* do the mallocs early so failure handling is easy */
-	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
-							(NC-1)*sizeof(cat_t));
+	g = (struct re_guts *)malloc(sizeof(struct re_guts));
 	if (g == NULL)
 		return(REG_ESPACE);
 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
@@ -252,9 +240,6 @@ int cflags;
 	g->matchjump = NULL;
 	g->mlen = 0;
 	g->nsub = 0;
-	g->ncategories = 1;	/* category 0 is "everything else" */
-	g->categories = &g->catspace[-(CHAR_MIN)];
-	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 	g->backrefs = 0;
 
 	/* do it */
@@ -270,7 +255,6 @@ int cflags;
 	g->laststate = THERE();
 
 	/* tidy up loose ends and fill things in */
-	categorize(p, g);
 	stripsnug(p, g);
 	findmust(p, g);
 	/* only use Boyer-Moore algorithm if the pattern is bigger
@@ -516,9 +500,7 @@ struct parse *p;
  * Giving end1 as OUT essentially eliminates the end1/end2 check.
  *
  * This implementation is a bit of a kludge, in that a trailing $ is first
- * taken as an ordinary character and then revised to be an anchor.  The
- * only undesirable side effect is that '$' gets included as a character
- * category in such cases.  This is fairly harmless; not worth fixing.
+ * taken as an ordinary character and then revised to be an anchor.
  * The amount of lookahead needed to avoid this kludge is excessive.
  */
 static void
@@ -739,8 +721,6 @@ struct parse *p;
 				if (ci != i)
 					CHadd(cs, ci);
 			}
-		if (cs->multis != NULL)
-			mccase(p, cs);
 	}
 	if (invert) {
 		int i;
@@ -752,12 +732,8 @@ struct parse *p;
 				CHadd(cs, i);
 		if (p->g->cflags&REG_NEWLINE)
 			CHsub(cs, '\n');
-		if (cs->multis != NULL)
-			mcinvert(p, cs);
 	}
 
-	assert(cs->multis == NULL);		/* xxx */
-
 	if (nch(p, cs) == 1) {		/* optimize singleton sets */
 		ordinary(p, firstch(p, cs));
 		freeset(p, cs);
@@ -812,7 +788,6 @@ cset *cs;
 		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 		break;
 	default:		/* symbol, ordinary character, or range */
-/* xxx revision needed for multichar stuff */
 		start = p_b_symbol(p);
 		if (SEE('-') && MORE2() && PEEK2() != ']') {
 			/* range */
@@ -932,10 +907,6 @@ cset *cs;
 				CHadd(cs, c);
 		break;
 	}
-#if 0
-	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
-		MCadd(p, cs, u);
-#endif
 }
 
 /*
@@ -1059,15 +1030,11 @@ ordinary(p, ch)
 struct parse *p;
 int ch;
 {
-	cat_t *cap = p->g->categories;
 
 	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
 		bothcases(p, ch);
-	else {
+	else
 		EMIT(OCHAR, (uch)ch);
-		if (cap[ch] == 0)
-			cap[ch] = p->g->ncategories++;
-	}
 }
 
 /*
@@ -1233,8 +1200,6 @@ struct parse *p;
 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
 	cs->mask = 1 << ((no) % CHAR_BIT);
 	cs->hash = 0;
-	cs->smultis = 0;
-	cs->multis = NULL;
 
 	return(cs);
 }
@@ -1337,193 +1302,6 @@ cset *cs;
 }
 
 /*
- - mcadd - add a collating element to a cset
- == static void mcadd(struct parse *p, cset *cs, \
- ==	char *cp);
- */
-static void
-mcadd(p, cs, cp)
-struct parse *p;
-cset *cs;
-char *cp;
-{
-	size_t oldend = cs->smultis;
-
-	cs->smultis += strlen(cp) + 1;
-	if (cs->multis == NULL)
-		cs->multis = malloc(cs->smultis);
-	else
-		cs->multis = reallocf(cs->multis, cs->smultis);
-	if (cs->multis == NULL) {
-		SETERROR(REG_ESPACE);
-		return;
-	}
-
-	(void) strcpy(cs->multis + oldend - 1, cp);
-	cs->multis[cs->smultis - 1] = '\0';
-}
-
-#if used
-/*
- - mcsub - subtract a collating element from a cset
- == static void mcsub(cset *cs, char *cp);
- */
-static void
-mcsub(cs, cp)
-cset *cs;
-char *cp;
-{
-	char *fp = mcfind(cs, cp);
-	size_t len = strlen(fp);
-
-	assert(fp != NULL);
-	(void) memmove(fp, fp + len + 1,
-				cs->smultis - (fp + len + 1 - cs->multis));
-	cs->smultis -= len;
-
-	if (cs->smultis == 0) {
-		free(cs->multis);
-		cs->multis = NULL;
-		return;
-	}
-
-	cs->multis = reallocf(cs->multis, cs->smultis);
-	assert(cs->multis != NULL);
-}
-
-/*
- - mcin - is a collating element in a cset?
- == static int mcin(cset *cs, char *cp);
- */
-static int
-mcin(cs, cp)
-cset *cs;
-char *cp;
-{
-	return(mcfind(cs, cp) != NULL);
-}
-
-/*
- - mcfind - find a collating element in a cset
- == static char *mcfind(cset *cs, char *cp);
- */
-static char *
-mcfind(cs, cp)
-cset *cs;
-char *cp;
-{
-	char *p;
-
-	if (cs->multis == NULL)
-		return(NULL);
-	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
-		if (strcmp(cp, p) == 0)
-			return(p);
-	return(NULL);
-}
-#endif
-
-/*
- - mcinvert - invert the list of collating elements in a cset
- == static void mcinvert(struct parse *p, cset *cs);
- *
- * This would have to know the set of possibilities.  Implementation
- * is deferred.
- */
-static void
-mcinvert(p, cs)
-struct parse *p;
-cset *cs;
-{
-	assert(cs->multis == NULL);	/* xxx */
-}
-
-/*
- - mccase - add case counterparts of the list of collating elements in a cset
- == static void mccase(struct parse *p, cset *cs);
- *
- * This would have to know the set of possibilities.  Implementation
- * is deferred.
- */
-static void
-mccase(p, cs)
-struct parse *p;
-cset *cs;
-{
-	assert(cs->multis == NULL);	/* xxx */
-}
-
-/*
- - isinsets - is this character in any sets?
- == static int isinsets(struct re_guts *g, int c);
- */
-static int			/* predicate */
-isinsets(g, c)
-struct re_guts *g;
-int c;
-{
-	uch *col;
-	int i;
-	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
-	unsigned uc = (uch)c;
-
-	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
-		if (col[uc] != 0)
-			return(1);
-	return(0);
-}
-
-/*
- - samesets - are these two characters in exactly the same sets?
- == static int samesets(struct re_guts *g, int c1, int c2);
- */
-static int			/* predicate */
-samesets(g, c1, c2)
-struct re_guts *g;
-int c1;
-int c2;
-{
-	uch *col;
-	int i;
-	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
-	unsigned uc1 = (uch)c1;
-	unsigned uc2 = (uch)c2;
-
-	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
-		if (col[uc1] != col[uc2])
-			return(0);
-	return(1);
-}
-
-/*
- - categorize - sort out character categories
- == static void categorize(struct parse *p, struct re_guts *g);
- */
-static void
-categorize(p, g)
-struct parse *p;
-struct re_guts *g;
-{
-	cat_t *cats = g->categories;
-	int c;
-	int c2;
-	cat_t cat;
-
-	/* avoid making error situations worse */
-	if (p->error != 0)
-		return;
-
-	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
-		if (cats[c] == 0 && isinsets(g, c)) {
-			cat = g->ncategories++;
-			cats[c] = cat;
-			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
-				if (cats[c2] == 0 && samesets(g, c, c2))
-					cats[c2] = cat;
-		}
-}
-
-/*
  - dupl - emit a duplicate of a bunch of sops
  == static sopno dupl(struct parse *p, sopno start, sopno finish);
  */
@@ -1698,18 +1476,11 @@ struct re_guts *g;
 	char *cp;
 	sopno i;
 	int offset;
-	int cs, mccs;
 
 	/* avoid making error situations worse */
 	if (p->error != 0)
 		return;
 
-	/* Find out if we can handle OANYOF or not */
-	mccs = 0;
-	for (cs = 0; cs < g->ncsets; cs++)
-		if (g->sets[cs].multis != NULL)
-			mccs = 1;
-
 	/* find the longest OCHAR sequence in strip */
 	newlen = 0;
 	offset = 0;
@@ -1729,7 +1500,7 @@ struct re_guts *g;
 			break;
 		case OQUEST_:		/* things that must be skipped */
 		case OCH_:
-			offset = altoffset(scan, offset, mccs);
+			offset = altoffset(scan, offset);
 			scan--;
 			do {
 				scan += OPND(s);
@@ -1797,11 +1568,6 @@ struct re_guts *g;
 			if (offset > -1)
 				offset++;
 			newlen = 0;
-			/* And, now, if we found out we can't deal with
-			 * it, make offset = -1.
-			 */
-			if (mccs)
-				offset = -1;
 			break;
 		default:
 			/* Anything here makes it impossible or too hard
@@ -1849,16 +1615,15 @@ struct re_guts *g;
 
 /*
  - altoffset - choose biggest offset among multiple choices
- == static int altoffset(sop *scan, int offset, int mccs);
+ == static int altoffset(sop *scan, int offset);
  *
  * Compute, recursively if necessary, the largest offset among multiple
  * re paths.
  */
 static int
-altoffset(scan, offset, mccs)
+altoffset(scan, offset)
 sop *scan;
 int offset;
-int mccs;
 {
 	int largest;
 	int try;
@@ -1880,7 +1645,7 @@ int mccs;
 			break;
 		case OQUEST_:
 		case OCH_:
-			try = altoffset(scan, try, mccs);
+			try = altoffset(scan, try);
 			if (try == -1)
 				return -1;
 			scan--;
@@ -1897,8 +1662,6 @@ int mccs;
 			scan++;
 			break;
 		case OANYOF:
-			if (mccs)
-				return -1;
 		case OCHAR:
 		case OANY:
 			try++;
diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h
index 8c645ce..4678824 100644
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@@ -113,29 +113,16 @@ typedef long sopno;
  * The individual set therefore has both a pointer to the byte vector
  * and a mask to pick out the relevant bit of each byte.  A hash code
  * simplifies testing whether two sets could be identical.
- *
- * This will get trickier for multicharacter collating elements.  As
- * preliminary hooks for dealing with such things, we also carry along
- * a string of multi-character elements, and decide the size of the
- * vectors at run time.
  */
 typedef struct {
 	uch *ptr;		/* -> uch [csetsize] */
 	uch mask;		/* bit within array */
 	short hash;             /* hash code */
-	size_t smultis;
-	char *multis;		/* -> char[smulti]  ab\0cd\0ef\0\0 */
 } cset;
 /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
 #define CHadd(cs, c)    ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (uch)(c))
 #define CHsub(cs, c)    ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (uch)(c))
 #define	CHIN(cs, c)	((cs)->ptr[(uch)(c)] & (cs)->mask)
-#define	MCadd(p, cs, cp)	mcadd(p, cs, cp)	/* regcomp() internal fns */
-#define	MCsub(p, cs, cp)	mcsub(p, cs, cp)
-#define	MCin(p, cs, cp)	mcin(p, cs, cp)
-
-/* stuff for character categories */
-typedef unsigned char cat_t;
 
 /*
  * main compiled-expression structure
@@ -158,8 +145,6 @@ struct re_guts {
 #		define	BAD	04	/* something wrong */
 	int nbol;		/* number of ^ used */
 	int neol;		/* number of $ used */
-	int ncategories;	/* how many character categories */
-	cat_t *categories;	/* ->catspace[-CHAR_MIN] */
 	char *must;		/* match must contain this string */
 	int moffset;		/* latest point at which must may be located */
 	int *charjump;		/* Boyer-Moore char jump table */
@@ -168,8 +153,6 @@ struct re_guts {
 	size_t nsub;		/* copy of re_nsub */
 	int backrefs;		/* does it use back references? */
 	sopno nplus;		/* how deep does it nest +s? */
-	/* catspace must be last */
-	cat_t catspace[1];	/* actually [NC] */
 };
 
 /* misc utilities */
-- 
cgit v1.1