summary refs log tree commit diff stats
path: root/contrib/nvi/common/encoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/nvi/common/encoding.c')
-rw-r--r-- contrib/nvi/common/encoding.c 230
1 files changed, 230 insertions, 0 deletions
diff --git a/contrib/nvi/common/encoding.c b/contrib/nvi/common/encoding.c
new file mode 100644
index 0000000..6de509e
--- /dev/null
+++ b/contrib/nvi/common/encoding.c
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 2011, 2012
+ * Zhihao Yuan. All rights reserved.
+ *
+ * See the LICENSE file for redistribution information.
+ */
+
+/* Version identification string; compiled only when `lint` is not defined. */
+#ifndef lint
+static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
+#endif /* not lint */
+
+#include <sys/types.h>
+
+/*
+ * Forward declarations of the entry points defined in this file.
+ *
+ * Written as plain ANSI prototypes (matching the ANSI-style definitions
+ * below) rather than through the K&R compatibility macro __P, so they do
+ * not depend on a system header happening to provide that macro.
+ */
+int looks_utf8(const char *ibuf, size_t nbytes);
+int looks_utf16(const char *ibuf, size_t nbytes);
+int decode_utf8(const char *ibuf);
+int decode_utf16(const char *ibuf, int bigend);
+
+/*
+ * Byte classes stored in text_chars[].  These single-letter macros exist
+ * only to keep the table below readable.
+ */
+#define F 0 /* character never appears in text */
+#define T 1 /* character appears in plain ASCII text */
+#define I 2 /* character appears in ISO-8859 text */
+#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
+
+/*
+ * text_chars --
+ *	Classification of every possible byte value, indexed by the byte.
+ *	The detection routines in this file only ask whether an entry is T,
+ *	i.e. whether a 7-bit value is an acceptable plain-text character.
+ *
+ *	The table is read-only, hence const.
+ */
+static const char text_chars[256] = {
+	/*                  BEL BS HT LF    FF CR    */
+	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
+	/*                              ESC          */
+	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
+	/*            NEL                            */
+	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
+	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
+};
+
+/*
+ * looks_utf8 --
+ *	Decide whether some text looks like UTF-8.  Returns:
+ *
+ *	-1: invalid UTF-8
+ *	 0: uses odd control characters, so doesn't look like text
+ *	 1: 7-bit text
+ *	 2: definitely UTF-8 text (valid high-bit set bytes)
+ *
+ *	ibuf is the buffer to scan and nbytes its length; the buffer does
+ *	not need to be NUL-terminated.
+ *
+ *	Based on RFC 3629: lead bytes C0, C1 and F5..FF are rejected, and
+ *	every trailing byte must have the form 10xxxxxx.  The value ranges
+ *	of the trailing bytes (overlong 3/4-byte forms, surrogates) are not
+ *	checked.  A multi-byte sequence cut off by the end of the buffer is
+ *	tolerated and simply not counted.  No special handling of a leading
+ *	BOM is done here; it is scanned like any other 3-byte sequence.
+ *
+ * PUBLIC: int looks_utf8 __P((const char *, size_t));
+ */
+int
+looks_utf8(const char *ibuf, size_t nbytes)
+{
+	const unsigned char *buf = (const unsigned char *)ibuf;
+	size_t i;
+	int n, following;
+	int gotone = 0, ctrl = 0;
+
+	for (i = 0; i < nbytes; i++) {
+		if ((buf[i] & 0x80) == 0) {	/* 0xxxxxxx is plain ASCII */
+			/*
+			 * Even if the whole file is valid UTF-8 sequences,
+			 * still reject it if it uses weird control characters.
+			 */
+			if (text_chars[buf[i]] != T)
+				ctrl = 1;
+			continue;
+		}
+
+		if ((buf[i] & 0x40) == 0)	/* 10xxxxxx never 1st byte */
+			return -1;
+
+		/* 11xxxxxx begins a multi-byte sequence; find its length. */
+		if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
+			if (buf[i] < 0xC2)		/* C0, C1 */
+				return -1;
+			following = 1;
+		} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
+			following = 2;
+		} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
+			if (buf[i] > 0xF4)		/* F5, F6, F7 */
+				return -1;
+			following = 3;
+		} else {
+			return -1;			/* F8~FF */
+		}
+
+		for (n = 0; n < following; n++) {
+			if (++i >= nbytes)
+				goto done;	/* sequence truncated by buffer end */
+
+			/*
+			 * A trailing byte must be exactly 10xxxxxx.  Testing
+			 * only bit 6 would let an ASCII byte (bit 7 clear)
+			 * pass as a continuation.
+			 */
+			if ((buf[i] & 0xC0) != 0x80)
+				return -1;
+		}
+
+		gotone = 1;
+	}
+done:
+	return ctrl ? 0 : (gotone ? 2 : 1);
+}
+
+/*
+ * looks_utf16 --
+ *	Decide whether some text looks like UTF-16.  Returns:
+ *
+ *	0: not acceptable as UTF-16 text (fewer than 2 bytes, no BOM,
+ *	   an ill-formed surrogate sequence, or a 7-bit code unit that
+ *	   is not a plain-text character)
+ *	1: Little-endian UTF-16
+ *	2: Big-endian UTF-16
+ *
+ *	ibuf is the buffer to scan and nbytes its length.  The first two
+ *	bytes must be a byte order mark (FF FE or FE FF), which selects the
+ *	byte order for the rest of the scan.  A trailing odd byte is
+ *	ignored, and a high surrogate that is the last complete code unit
+ *	in the buffer is tolerated.
+ *
+ * PUBLIC: int looks_utf16 __P((const char *, size_t));
+ */
+int
+looks_utf16(const char *ibuf, size_t nbytes)
+{
+	const unsigned char *buf = (const unsigned char *)ibuf;
+	unsigned int bom, c;
+	size_t i;
+	int bigend;
+	int following = 0;	/* set while a low surrogate is expected */
+
+	if (nbytes < 2)
+		return 0;
+
+	bom = (unsigned int)buf[0] << 8 | buf[1];
+	if (bom == 0xFFFE)
+		bigend = 0;
+	else if (bom == 0xFEFF)
+		bigend = 1;
+	else
+		return 0;
+
+	for (i = 2; i + 1 < nbytes; i += 2) {
+		if (bigend)
+			c = (unsigned int)buf[i] << 8 | buf[i + 1];
+		else
+			c = (unsigned int)buf[i + 1] << 8 | buf[i];
+
+		if (following) {
+			/* The unit after a high surrogate must be a low one. */
+			if (c < 0xDC00 || c > 0xDFFF)
+				return 0;
+			/*
+			 * Pair complete.  Without this reset every later
+			 * code unit would also be required to be a low
+			 * surrogate.
+			 */
+			following = 0;
+		} else if (c >= 0xD800 && c <= 0xDBFF) {
+			following = 1;		/* high surrogate */
+		} else if (c >= 0xDC00 && c <= 0xDFFF) {
+			return 0;		/* unpaired low surrogate */
+		} else if (c < 128 && text_chars[c] != T) {
+			return 0;		/* odd control character */
+		}
+	}
+
+	return 1 + bigend;
+}
+
+/* The single-letter byte-class macros are not needed past this point. */
+#undef F
+#undef T
+#undef I
+#undef X
+
+/*
+ * decode_utf8 --
+ *	Decode the UTF-8 sequence starting at ibuf into a Unicode value.
+ *	Returns -1 if the first byte is not a UTF-8 leader (10xxxxxx, or
+ *	11111xxx).
+ *
+ *	The length of the sequence is taken from the lead byte alone and
+ *	that many trailing bytes are read unconditionally; they are not
+ *	validated.  Based on RFC 3629, but without error detection.
+ *
+ * PUBLIC: int decode_utf8 __P((const char *));
+ */
+int
+decode_utf8(const char *ibuf)
+{
+	const unsigned char *p = (const unsigned char *)ibuf;
+	const int lead = p[0];
+
+	if (lead < 0x80)		/* 0xxxxxxx: ASCII, value is the byte */
+		return lead;
+	if (lead < 0xC0)		/* 10xxxxxx: trailing byte, not a leader */
+		return -1;
+	if (lead < 0xE0)		/* 110xxxxx: two-byte sequence */
+		return ((lead & 0x1F) << 6) ^ (p[1] ^ 0x80);
+	if (lead < 0xF0)		/* 1110xxxx: three-byte sequence */
+		return ((lead & 0x0F) << 12) ^ ((p[1] ^ 0x80) << 6) ^
+		    (p[2] ^ 0x80);
+	if (lead < 0xF8)		/* 11110xxx: four-byte sequence */
+		return ((lead & 0x07) << 18) ^ ((p[1] ^ 0x80) << 12) ^
+		    ((p[2] ^ 0x80) << 6) ^ (p[3] ^ 0x80);
+
+	return -1;			/* 11111xxx: never a leader */
+}
+
+/*
+ * decode_utf16 --
+ *	Decode the UTF-16 character starting at ibuf into a Unicode value.
+ *	bigend selects the byte order: non-zero for big-endian, zero for
+ *	little-endian.  Returns -1 if the first 16-bit unit is a low
+ *	surrogate (DC00..DFFF).
+ *
+ *	When the first unit is a high surrogate, the next two bytes are
+ *	read and combined with it; no error detection is done on them.
+ *
+ * PUBLIC: int decode_utf16 __P((const char *, int));
+ */
+int
+decode_utf16(const char *ibuf, int bigend)
+{
+	const unsigned char *bytes = (const unsigned char *)ibuf;
+	/* Offsets of the most/least significant byte inside a 2-byte unit. */
+	const int msb = bigend ? 0 : 1;
+	const int lsb = 1 - msb;
+	unsigned int lead, trail;
+
+	lead = (unsigned int)bytes[msb] << 8 | bytes[lsb];
+
+	/* Outside the surrogate range the unit is the code point itself. */
+	if (lead < 0xD800 || lead > 0xDFFF)
+		return (int)lead;
+
+	/* A low surrogate cannot start a character. */
+	if (lead > 0xDBFF)
+		return -1;
+
+	/* High surrogate: combine with the following unit. */
+	trail = (unsigned int)bytes[2 + msb] << 8 | bytes[2 + lsb];
+	return (int)((((lead ^ 0xD800) << 10) ^ (trail ^ 0xDC00)) + 0x10000);
+}
OpenPOWER on IntegriCloud