From 85448eebc0003309896515e0fb1aa825dd6769d4 Mon Sep 17 00:00:00 2001 From: ache Date: Sat, 6 Mar 2010 19:21:57 +0000 Subject: 1) Rewrite input processing to not exit with error on the first EILSEQ found in the input data but fallback to "binary equal" check instead. POSIX says: "The input file shall be a text file", nothing more, so the text file with illegal sequence is valid input. BTW, GNU sort does not fails on EILSEQ too. 2) Speedup input processing a bit in complex cases like skipping fields, chars or ignore case. 3) Enforce the implied LINE_MAX limit (from POSIX definition of "text file" and POSIX uniq(1) description). --- usr.bin/uniq/uniq.c | 174 ++++++++++++++++++++-------------------------------- 1 file changed, 68 insertions(+), 106 deletions(-) (limited to 'usr.bin/uniq') diff --git a/usr.bin/uniq/uniq.c b/usr.bin/uniq/uniq.c index 2b11fe4..89149dcdf 100644 --- a/usr.bin/uniq/uniq.c +++ b/usr.bin/uniq/uniq.c @@ -60,31 +60,25 @@ static const char rcsid[] = #include #include -#define INITLINELEN (LINE_MAX + 1) -#define MAXLINELEN ((SIZE_MAX / sizeof(wchar_t)) / 2) - -int cflag, dflag, uflag; +int cflag, dflag, uflag, iflag; int numchars, numfields, repeats; FILE *file(const char *, const char *); -wchar_t *getline(wchar_t *, size_t *, FILE *); -void show(FILE *, wchar_t *); +wchar_t *convert(wchar_t *, const char *); +char *getlinemax(char *, FILE *); +void show(FILE *, const char *); wchar_t *skip(wchar_t *); void obsolete(char *[]); static void usage(void); -int wcsicoll(wchar_t *, wchar_t *); int main (int argc, char *argv[]) { - wchar_t *t1, *t2; + wchar_t *tprev, *tthis, *wprev, *wthis, *wp; FILE *ifp, *ofp; - int ch, b1; - size_t prevbuflen, thisbuflen; - wchar_t *prevline, *thisline; - char *p; + int ch, comp; + char *prevline, *thisline, *p; const char *ifn; - int iflag = 0, comp; (void) setlocale(LC_ALL, ""); @@ -139,48 +133,47 @@ main (int argc, char *argv[]) if (argc > 1) ofp = file(argv[1], "w"); - prevbuflen = INITLINELEN; - thisbuflen = INITLINELEN; - prevline = malloc(prevbuflen * sizeof(*prevline)); - thisline = malloc(thisbuflen * sizeof(*thisline)); - if (prevline == NULL || thisline == NULL) + prevline = malloc(LINE_MAX); + thisline = malloc(LINE_MAX); + wprev = malloc(LINE_MAX * sizeof(*wprev)); + wthis = malloc(LINE_MAX * sizeof(*wthis)); + if (prevline == NULL || thisline == NULL || + wprev == NULL || wthis == NULL) err(1, "malloc"); - if ((prevline = getline(prevline, &prevbuflen, ifp)) == NULL) { + if ((prevline = getlinemax(prevline, ifp)) == NULL) { if (ferror(ifp)) err(1, "%s", ifn); exit(0); } + tprev = convert(wprev, prevline); + if (!cflag && uflag && dflag) show(ofp, prevline); - while ((thisline = getline(thisline, &thisbuflen, ifp)) != NULL) { - /* If requested get the chosen fields + character offsets. */ - if (numfields || numchars) { - t1 = skip(thisline); - t2 = skip(prevline); - } else { - t1 = thisline; - t2 = prevline; - } + while ((thisline = getlinemax(thisline, ifp)) != NULL) { + tthis = convert(wthis, thisline); - /* If different, print; set previous to new value. */ - if (iflag) - comp = wcsicoll(t1, t2); + if (tthis == NULL && tprev == NULL) + comp = strcmp(thisline, prevline); + else if (tthis == NULL || tprev == NULL) + comp = 1; else - comp = wcscoll(t1, t2); + comp = wcscoll(tthis, tprev); if (comp) { + /* If different, print; set previous to new value. */ if (cflag || !dflag || !uflag) show(ofp, prevline); - t1 = prevline; - b1 = prevbuflen; + p = prevline; + wp = wprev; prevline = thisline; - prevbuflen = thisbuflen; + wprev = wthis; + tprev = tthis; if (!cflag && uflag && dflag) show(ofp, prevline); - thisline = t1; - thisbuflen = b1; + thisline = p; + wthis = wp; repeats = 0; } else ++repeats; @@ -192,44 +185,61 @@ main (int argc, char *argv[]) exit(0); } -wchar_t * -getline(wchar_t *buf, size_t *buflen, FILE *fp) +char * +getlinemax(char *buf, FILE *fp) { size_t bufpos; - wint_t ch; + int ch; bufpos = 0; - while ((ch = getwc(fp)) != WEOF && ch != '\n') { - if (bufpos + 1 >= *buflen) { - *buflen = *buflen * 2; - if (*buflen > MAXLINELEN) - errx(1, - "Maximum line buffer length (%zu) exceeded", - MAXLINELEN); - buf = reallocf(buf, *buflen * sizeof(*buf)); - if (buf == NULL) - err(1, "reallocf"); - } + while ((ch = getc(fp)) != EOF && ch != '\n') { buf[bufpos++] = ch; + if (bufpos >= LINE_MAX) + errx(1, "Maximum line length (%zu) exceeded", + LINE_MAX); } buf[bufpos] = '\0'; return (bufpos != 0 || ch == '\n' ? buf : NULL); } +wchar_t * +convert(wchar_t *buf, const char *str) +{ + size_t n; + wchar_t *p, *ret; + + if ((n = mbstowcs(buf, str, LINE_MAX)) == LINE_MAX) + errx(1, "Maximum line length (%zu) exceeded", LINE_MAX); + else if (n != (size_t)-1) { + /* If requested get the chosen fields + character offsets. */ + if (numfields || numchars) + ret = skip(buf); + else + ret = buf; + if (iflag) { + for (p = ret; *p != L'\0'; p++) + *p = towlower(*p); + } + } else + ret = NULL; + + return (ret); +} + /* * show -- * Output a line depending on the flags and number of repetitions * of the line. */ void -show(FILE *ofp, wchar_t *str) +show(FILE *ofp, const char *str) { if (cflag) - (void)fprintf(ofp, "%4d %ls\n", repeats + 1, str); + (void)fprintf(ofp, "%4d %s\n", repeats + 1, str); if ((dflag && repeats) || (uflag && !repeats)) - (void)fprintf(ofp, "%ls\n", str); + (void)fprintf(ofp, "%s\n", str); } wchar_t * @@ -237,13 +247,14 @@ skip(wchar_t *str) { int nchars, nfields; - for (nfields = 0; *str != '\0' && nfields++ != numfields; ) { + for (nfields = 0; *str != L'\0' && nfields++ != numfields; ) { while (iswblank(*str)) str++; - while (*str != '\0' && !iswblank(*str)) + while (*str != L'\0' && !iswblank(*str)) str++; } - for (nchars = numchars; nchars-- && *str; ++str); + for (nchars = numchars; nchars-- && *str != L'\0'; ++str) + ; return(str); } @@ -293,52 +304,3 @@ usage(void) "usage: uniq [-c | -d | -u] [-i] [-f fields] [-s chars] [input [output]]\n"); exit(1); } - -static size_t wcsicoll_l1_buflen = 0, wcsicoll_l2_buflen = 0; -static wchar_t *wcsicoll_l1_buf = NULL, *wcsicoll_l2_buf = NULL; - -int -wcsicoll(wchar_t *s1, wchar_t *s2) -{ - wchar_t *p; - size_t l1, l2; - size_t new_l1_buflen, new_l2_buflen; - - l1 = wcslen(s1) + 1; - l2 = wcslen(s2) + 1; - new_l1_buflen = wcsicoll_l1_buflen; - new_l2_buflen = wcsicoll_l2_buflen; - while (new_l1_buflen < l1) { - if (new_l1_buflen == 0) - new_l1_buflen = INITLINELEN; - else - new_l1_buflen *= 2; - } - while (new_l2_buflen < l2) { - if (new_l2_buflen == 0) - new_l2_buflen = INITLINELEN; - else - new_l2_buflen *= 2; - } - if (new_l1_buflen > wcsicoll_l1_buflen) { - wcsicoll_l1_buf = reallocf(wcsicoll_l1_buf, new_l1_buflen * sizeof(*wcsicoll_l1_buf)); - if (wcsicoll_l1_buf == NULL) - err(1, "reallocf"); - wcsicoll_l1_buflen = new_l1_buflen; - } - if (new_l2_buflen > wcsicoll_l2_buflen) { - wcsicoll_l2_buf = reallocf(wcsicoll_l2_buf, new_l2_buflen * sizeof(*wcsicoll_l2_buf)); - if (wcsicoll_l2_buf == NULL) - err(1, "reallocf"); - wcsicoll_l2_buflen = new_l2_buflen; - } - - for (p = wcsicoll_l1_buf; *s1; s1++) - *p++ = towlower(*s1); - *p = '\0'; - for (p = wcsicoll_l2_buf; *s2; s2++) - *p++ = towlower(*s2); - *p = '\0'; - - return (wcscoll(wcsicoll_l1_buf, wcsicoll_l2_buf)); -} -- cgit v1.1