From db66ea27a00160961db238cc97c52107e5bc0e83 Mon Sep 17 00:00:00 2001 From: tjr Date: Sun, 11 Jul 2004 05:58:31 +0000 Subject: Remove incomplete support for multi-character collating elements. Remove unused character category calculations. --- lib/libc/regex/regcomp.c | 253 ++--------------------------------------------- lib/libc/regex/regex2.h | 17 ---- 2 files changed, 8 insertions(+), 262 deletions(-) (limited to 'lib/libc/regex') diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index e342fda..95a2874 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -106,17 +106,6 @@ static void freeset(struct parse *p, cset *cs); static int freezeset(struct parse *p, cset *cs); static int firstch(struct parse *p, cset *cs); static int nch(struct parse *p, cset *cs); -static void mcadd(struct parse *p, cset *cs, char *cp) __unused; -#if used -static void mcsub(cset *cs, char *cp); -static int mcin(cset *cs, char *cp); -static char *mcfind(cset *cs, char *cp); -#endif -static void mcinvert(struct parse *p, cset *cs); -static void mccase(struct parse *p, cset *cs); -static int isinsets(struct re_guts *g, int c); -static int samesets(struct re_guts *g, int c1, int c2); -static void categorize(struct parse *p, struct re_guts *g); static sopno dupl(struct parse *p, sopno start, sopno finish); static void doemit(struct parse *p, sop op, size_t opnd); static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); @@ -124,7 +113,7 @@ static void dofwd(struct parse *p, sopno pos, sop value); static void enlarge(struct parse *p, sopno size); static void stripsnug(struct parse *p, struct re_guts *g); static void findmust(struct parse *p, struct re_guts *g); -static int altoffset(sop *scan, int offset, int mccs); +static int altoffset(sop *scan, int offset); static void computejumps(struct parse *p, struct re_guts *g); static void computematchjumps(struct parse *p, struct re_guts *g); static sopno pluscount(struct parse *p, struct re_guts *g); 
@@ -216,8 +205,7 @@ int cflags; len = strlen((char *)pattern); /* do the mallocs early so failure handling is easy */ - g = (struct re_guts *)malloc(sizeof(struct re_guts) + - (NC-1)*sizeof(cat_t)); + g = (struct re_guts *)malloc(sizeof(struct re_guts)); if (g == NULL) return(REG_ESPACE); p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ @@ -252,9 +240,6 @@ int cflags; g->matchjump = NULL; g->mlen = 0; g->nsub = 0; - g->ncategories = 1; /* category 0 is "everything else" */ - g->categories = &g->catspace[-(CHAR_MIN)]; - (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); g->backrefs = 0; /* do it */ @@ -270,7 +255,6 @@ int cflags; g->laststate = THERE(); /* tidy up loose ends and fill things in */ - categorize(p, g); stripsnug(p, g); findmust(p, g); /* only use Boyer-Moore algorithm if the pattern is bigger @@ -516,9 +500,7 @@ struct parse *p; * Giving end1 as OUT essentially eliminates the end1/end2 check. * * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. The - * only undesirable side effect is that '$' gets included as a character - * category in such cases. This is fairly harmless; not worth fixing. + * taken as an ordinary character and then revised to be an anchor. * The amount of lookahead needed to avoid this kludge is excessive. 
*/ static void @@ -739,8 +721,6 @@ struct parse *p; if (ci != i) CHadd(cs, ci); } - if (cs->multis != NULL) - mccase(p, cs); } if (invert) { int i; @@ -752,12 +732,8 @@ struct parse *p; CHadd(cs, i); if (p->g->cflags&REG_NEWLINE) CHsub(cs, '\n'); - if (cs->multis != NULL) - mcinvert(p, cs); } - assert(cs->multis == NULL); /* xxx */ - if (nch(p, cs) == 1) { /* optimize singleton sets */ ordinary(p, firstch(p, cs)); freeset(p, cs); @@ -812,7 +788,6 @@ cset *cs; (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); break; default: /* symbol, ordinary character, or range */ -/* xxx revision needed for multichar stuff */ start = p_b_symbol(p); if (SEE('-') && MORE2() && PEEK2() != ']') { /* range */ @@ -932,10 +907,6 @@ cset *cs; CHadd(cs, c); break; } -#if 0 - for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) - MCadd(p, cs, u); -#endif } /* @@ -1059,15 +1030,11 @@ ordinary(p, ch) struct parse *p; int ch; { - cat_t *cap = p->g->categories; if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) bothcases(p, ch); - else { + else EMIT(OCHAR, (uch)ch); - if (cap[ch] == 0) - cap[ch] = p->g->ncategories++; - } } /* @@ -1233,8 +1200,6 @@ struct parse *p; cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); cs->mask = 1 << ((no) % CHAR_BIT); cs->hash = 0; - cs->smultis = 0; - cs->multis = NULL; return(cs); } @@ -1337,193 +1302,6 @@ cset *cs; } /* - - mcadd - add a collating element to a cset - == static void mcadd(struct parse *p, cset *cs, \ - == char *cp); - */ -static void -mcadd(p, cs, cp) -struct parse *p; -cset *cs; -char *cp; -{ - size_t oldend = cs->smultis; - - cs->smultis += strlen(cp) + 1; - if (cs->multis == NULL) - cs->multis = malloc(cs->smultis); - else - cs->multis = reallocf(cs->multis, cs->smultis); - if (cs->multis == NULL) { - SETERROR(REG_ESPACE); - return; - } - - (void) strcpy(cs->multis + oldend - 1, cp); - cs->multis[cs->smultis - 1] = '\0'; -} - -#if used -/* - - mcsub - subtract a collating element from a cset - == static void mcsub(cset *cs, char 
*cp); - */ -static void -mcsub(cs, cp) -cset *cs; -char *cp; -{ - char *fp = mcfind(cs, cp); - size_t len = strlen(fp); - - assert(fp != NULL); - (void) memmove(fp, fp + len + 1, - cs->smultis - (fp + len + 1 - cs->multis)); - cs->smultis -= len; - - if (cs->smultis == 0) { - free(cs->multis); - cs->multis = NULL; - return; - } - - cs->multis = reallocf(cs->multis, cs->smultis); - assert(cs->multis != NULL); -} - -/* - - mcin - is a collating element in a cset? - == static int mcin(cset *cs, char *cp); - */ -static int -mcin(cs, cp) -cset *cs; -char *cp; -{ - return(mcfind(cs, cp) != NULL); -} - -/* - - mcfind - find a collating element in a cset - == static char *mcfind(cset *cs, char *cp); - */ -static char * -mcfind(cs, cp) -cset *cs; -char *cp; -{ - char *p; - - if (cs->multis == NULL) - return(NULL); - for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) - if (strcmp(cp, p) == 0) - return(p); - return(NULL); -} -#endif - -/* - - mcinvert - invert the list of collating elements in a cset - == static void mcinvert(struct parse *p, cset *cs); - * - * This would have to know the set of possibilities. Implementation - * is deferred. - */ -static void -mcinvert(p, cs) -struct parse *p; -cset *cs; -{ - assert(cs->multis == NULL); /* xxx */ -} - -/* - - mccase - add case counterparts of the list of collating elements in a cset - == static void mccase(struct parse *p, cset *cs); - * - * This would have to know the set of possibilities. Implementation - * is deferred. - */ -static void -mccase(p, cs) -struct parse *p; -cset *cs; -{ - assert(cs->multis == NULL); /* xxx */ -} - -/* - - isinsets - is this character in any sets? 
- == static int isinsets(struct re_guts *g, int c); - */ -static int /* predicate */ -isinsets(g, c) -struct re_guts *g; -int c; -{ - uch *col; - int i; - int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; - unsigned uc = (uch)c; - - for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) - if (col[uc] != 0) - return(1); - return(0); -} - -/* - - samesets - are these two characters in exactly the same sets? - == static int samesets(struct re_guts *g, int c1, int c2); - */ -static int /* predicate */ -samesets(g, c1, c2) -struct re_guts *g; -int c1; -int c2; -{ - uch *col; - int i; - int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; - unsigned uc1 = (uch)c1; - unsigned uc2 = (uch)c2; - - for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) - if (col[uc1] != col[uc2]) - return(0); - return(1); -} - -/* - - categorize - sort out character categories - == static void categorize(struct parse *p, struct re_guts *g); - */ -static void -categorize(p, g) -struct parse *p; -struct re_guts *g; -{ - cat_t *cats = g->categories; - int c; - int c2; - cat_t cat; - - /* avoid making error situations worse */ - if (p->error != 0) - return; - - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (cats[c] == 0 && isinsets(g, c)) { - cat = g->ncategories++; - cats[c] = cat; - for (c2 = c+1; c2 <= CHAR_MAX; c2++) - if (cats[c2] == 0 && samesets(g, c, c2)) - cats[c2] = cat; - } -} - -/* - dupl - emit a duplicate of a bunch of sops == static sopno dupl(struct parse *p, sopno start, sopno finish); */ @@ -1698,18 +1476,11 @@ struct re_guts *g; char *cp; sopno i; int offset; - int cs, mccs; /* avoid making error situations worse */ if (p->error != 0) return; - /* Find out if we can handle OANYOF or not */ - mccs = 0; - for (cs = 0; cs < g->ncsets; cs++) - if (g->sets[cs].multis != NULL) - mccs = 1; - /* find the longest OCHAR sequence in strip */ newlen = 0; offset = 0; @@ -1729,7 +1500,7 @@ struct re_guts *g; break; case OQUEST_: /* things that must be skipped */ case OCH_: - 
offset = altoffset(scan, offset, mccs); + offset = altoffset(scan, offset); scan--; do { scan += OPND(s); @@ -1797,11 +1568,6 @@ struct re_guts *g; if (offset > -1) offset++; newlen = 0; - /* And, now, if we found out we can't deal with - * it, make offset = -1. - */ - if (mccs) - offset = -1; break; default: /* Anything here makes it impossible or too hard @@ -1849,16 +1615,15 @@ struct re_guts *g; /* - altoffset - choose biggest offset among multiple choices - == static int altoffset(sop *scan, int offset, int mccs); + == static int altoffset(sop *scan, int offset); * * Compute, recursively if necessary, the largest offset among multiple * re paths. */ static int -altoffset(scan, offset, mccs) +altoffset(scan, offset) sop *scan; int offset; -int mccs; { int largest; int try; @@ -1880,7 +1645,7 @@ int mccs; break; case OQUEST_: case OCH_: - try = altoffset(scan, try, mccs); + try = altoffset(scan, try); if (try == -1) return -1; scan--; @@ -1897,8 +1662,6 @@ int mccs; scan++; break; case OANYOF: - if (mccs) - return -1; case OCHAR: case OANY: try++; diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h index 8c645ce..4678824 100644 --- a/lib/libc/regex/regex2.h +++ b/lib/libc/regex/regex2.h @@ -113,29 +113,16 @@ typedef long sopno; * The individual set therefore has both a pointer to the byte vector * and a mask to pick out the relevant bit of each byte. A hash code * simplifies testing whether two sets could be identical. - * - * This will get trickier for multicharacter collating elements. As - * preliminary hooks for dealing with such things, we also carry along - * a string of multi-character elements, and decide the size of the - * vectors at run time. 
*/ typedef struct { uch *ptr; /* -> uch [csetsize] */ uch mask; /* bit within array */ short hash; /* hash code */ - size_t smultis; - char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ } cset; /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ #define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (uch)(c)) #define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (uch)(c)) #define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) -#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */ -#define MCsub(p, cs, cp) mcsub(p, cs, cp) -#define MCin(p, cs, cp) mcin(p, cs, cp) - -/* stuff for character categories */ -typedef unsigned char cat_t; /* * main compiled-expression structure @@ -158,8 +145,6 @@ struct re_guts { # define BAD 04 /* something wrong */ int nbol; /* number of ^ used */ int neol; /* number of $ used */ - int ncategories; /* how many character categories */ - cat_t *categories; /* ->catspace[-CHAR_MIN] */ char *must; /* match must contain this string */ int moffset; /* latest point at which must may be located */ int *charjump; /* Boyer-Moore char jump table */ @@ -168,8 +153,6 @@ struct re_guts { size_t nsub; /* copy of re_nsub */ int backrefs; /* does it use back references? */ sopno nplus; /* how deep does it nest +s? */ - /* catspace must be last */ - cat_t catspace[1]; /* actually [NC] */ }; /* misc utilities */ -- cgit v1.1