From 3aceb5c9da1dc52d43369ca35c656c2ce8a2c6a1 Mon Sep 17 00:00:00 2001 From: tjr Date: Sun, 4 Jul 2004 11:58:10 +0000 Subject: Improve case-insensitive matching in multibyte locales. Obtained from: Isamu Hasegawa (IBM) via Fedora --- gnu/usr.bin/grep/dfa.c | 20 ++++++------ gnu/usr.bin/grep/grep.c | 37 ++++++++++++++++++++++ gnu/usr.bin/grep/search.c | 80 ++++++++++++++++++++++++++++++++++++----------- 3 files changed, 109 insertions(+), 28 deletions(-) diff --git a/gnu/usr.bin/grep/dfa.c b/gnu/usr.bin/grep/dfa.c index 24a85a1..ffbb751 100644 --- a/gnu/usr.bin/grep/dfa.c +++ b/gnu/usr.bin/grep/dfa.c @@ -416,7 +416,7 @@ update_mb_len_index (unsigned char const *p, int len) /* This function fetch a wide character, and update cur_mb_len, used only if the current locale is a multibyte environment. */ -static wchar_t +static wint_t fetch_wc (char const *eoferr) { wchar_t wc; @@ -425,7 +425,7 @@ fetch_wc (char const *eoferr) if (eoferr != 0) dfaerror (eoferr); else - return -1; + return WEOF; } cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); @@ -461,7 +461,7 @@ fetch_wc (char const *eoferr) static void parse_bracket_exp_mb () { - wchar_t wc, wc1, wc2; + wint_t wc, wc1, wc2; /* Work area to build a mb_char_classes. */ struct mb_char_classes *work_mbc; @@ -498,7 +498,7 @@ parse_bracket_exp_mb () work_mbc->invert = 0; do { - wc1 = -1; /* mark wc1 is not initialized". */ + wc1 = WEOF; /* mark wc1 is not initialized". */ /* Note that if we're looking at some other [:...:] construct, we just treat it as a bunch of ordinary characters. We can do @@ -588,7 +588,7 @@ parse_bracket_exp_mb () work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; } } - wc = -1; + wc = WEOF; } else /* We treat '[' as a normal character here. */ @@ -602,7 +602,7 @@ parse_bracket_exp_mb () wc = fetch_wc(("Unbalanced [")); } - if (wc1 == -1) + if (wc1 == WEOF) wc1 = fetch_wc(_("Unbalanced [")); if (wc1 == L'-') @@ -632,17 +632,17 @@ parse_bracket_exp_mb () } REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, range_sts_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = wc; + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, range_ends_al, work_mbc->nranges + 1); - work_mbc->range_ends[work_mbc->nranges++] = wc2; + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; } - else if (wc != -1) + else if (wc != WEOF) /* build normal characters. */ { REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = wc; + work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; } } while ((wc = wc1) != L']'); diff --git a/gnu/usr.bin/grep/grep.c b/gnu/usr.bin/grep/grep.c index c831537..ad1dd49 100644 --- a/gnu/usr.bin/grep/grep.c +++ b/gnu/usr.bin/grep/grep.c @@ -33,6 +33,12 @@ # include # include #endif +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC +/* We can handle multibyte string. */ +# define MBS_SUPPORT +# include +# include +#endif #include #include "system.h" #include "getopt.h" @@ -1805,6 +1811,37 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) if (!install_matcher (matcher) && !install_matcher ("default")) abort (); +#ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1 && match_icase) + { + wchar_t wc; + mbstate_t cur_state, prev_state; + int i, len = strlen(keys); + + memset(&cur_state, 0, sizeof(mbstate_t)); + for (i = 0; i <= len ;) + { + size_t mbclen; + mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state); + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + { + /* An invalid sequence, or a truncated multibyte character. + We treat it as a singlebyte character. */ + mbclen = 1; + } + else + { + if (iswupper((wint_t)wc)) + { + wc = towlower((wint_t)wc); + wcrtomb(keys + i, wc, &cur_state); + } + } + i += mbclen; + } + } +#endif /* MBS_SUPPORT */ + (*compile)(keys, keycc); if ((argc - optind > 1 && !no_filenames) || with_filenames) diff --git a/gnu/usr.bin/grep/search.c b/gnu/usr.bin/grep/search.c index 4336caf..b8a1dcd 100644 --- a/gnu/usr.bin/grep/search.c +++ b/gnu/usr.bin/grep/search.c @@ -151,15 +151,16 @@ kwsmusts (void) static char* check_multibyte_string(char const *buf, size_t size) { - char *mb_properties = malloc(size); + char *mb_properties = xmalloc(size); mbstate_t cur_state; + wchar_t wc; int i; memset(&cur_state, 0, sizeof(mbstate_t)); memset(mb_properties, 0, sizeof(char)*size); for (i = 0; i < size ;) { size_t mbclen; - mbclen = mbrlen(buf + i, size - i, &cur_state); + mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) { @@ -167,6 +168,14 @@ check_multibyte_string(char const *buf, size_t size) We treat it as a singlebyte character. */ mbclen = 1; } + else if (match_icase) + { + if (iswupper((wint_t)wc)) + { + wc = towlower((wint_t)wc); + wcrtomb(buf + i, wc, &cur_state); + } + } mb_properties[i] = mbclen; i += mbclen; } @@ -235,7 +244,7 @@ Gcompile (char const *pattern, size_t size) static char const line_end[] = "\\)$"; static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); size_t i; strcpy (n, match_lines ? line_beg : word_beg); i = strlen (n); @@ -318,7 +327,7 @@ Ecompile (char const *pattern, size_t size) static char const line_end[] = ")$"; static char const word_beg[] = "(^|[^[:alnum:]_])("; static char const word_end[] = ")([^[:alnum:]_]|$)"; - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); size_t i; strcpy (n, match_lines ? line_beg : word_beg); i = strlen(n); @@ -341,14 +350,20 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) char eol = eolbyte; int backref, start, len; struct kwsmatch kwsm; - size_t i; + size_t i, ret_val; #ifdef MBS_SUPPORT char *mb_properties = NULL; -#endif /* MBS_SUPPORT */ - -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && kwset) - mb_properties = check_multibyte_string(buf, size); + if (MB_CUR_MAX > 1) + { + if (match_icase) + { + char *case_buf = xmalloc(size); + memcpy(case_buf, buf, size); + buf = case_buf; + } + if (kwset) + mb_properties = check_multibyte_string(buf, size); + } #endif /* MBS_SUPPORT */ buflim = buf + size; @@ -457,8 +472,13 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) failure: #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties) - free (mb_properties); + if (MB_CUR_MAX > 1) + { + if (mb_properties) + free (mb_properties); + if (match_icase) + free ((char *) buf); + } #endif /* MBS_SUPPORT */ return (size_t) -1; @@ -469,8 +489,13 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) success_in_start_and_len: #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties) - free (mb_properties); + if (MB_CUR_MAX > 1) + { + if (mb_properties) + free (mb_properties); + if (match_icase) + free ((char *) buf); + } #endif /* MBS_SUPPORT */ *match_size = len; return start; @@ -506,10 +531,19 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact) register size_t len; char eol = eolbyte; struct kwsmatch kwsmatch; + size_t ret_val; #ifdef MBS_SUPPORT - char *mb_properties; + char *mb_properties = NULL; if (MB_CUR_MAX > 1) - mb_properties = check_multibyte_string (buf, size); + { + if (match_icase) + { + char *case_buf = xmalloc(size); + memcpy(case_buf, buf, size); + buf = case_buf; + } + mb_properties = check_multibyte_string(buf, size); + } #endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) @@ -567,7 +601,12 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact) failure: #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) - free (mb_properties); + { + if (match_icase) + free((char *) buf); + if (mb_properties) + free(mb_properties); + } #endif /* MBS_SUPPORT */ return -1; @@ -583,7 +622,12 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact) *match_size = len; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) - free (mb_properties); + { + if (mb_properties) + free (mb_properties); + if (match_icase) + free ((char *) buf); + } #endif /* MBS_SUPPORT */ return beg - buf; } -- cgit v1.1