--- libxklavier/utf8.c 1970-01-01 03:00:00.000000000 +0300 +++ libxklavier/utf8.c 2015-11-18 01:50:50.406110000 +0300 @@ -0,0 +1,61 @@ +/* + * The utf8_check() function scans the '\0'-terminated string starting + * at s. It returns a pointer to the first byte of the first malformed + * or overlong UTF-8 sequence found, or NULL if the string contains + * only correct UTF-8. It also spots UTF-8 sequences that could cause + * trouble if converted to UTF-16, namely surrogate characters + * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This + * routine is very likely to find a malformed sequence if the input + * uses any other encoding than UTF-8. It therefore can be used as a + * very effective heuristic for distinguishing between UTF-8 and other + * encodings. + * + * I wrote this code mainly as a specification of functionality; there + * are no doubt performance optimizations possible for certain CPUs. + * + * Markus Kuhn -- 2005-03-30 + * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html + */ + + +guchar * +utf8_check(guchar *s) +{ + while (*s) { + if (*s < 0x80) + /* 0xxxxxxx */ + s++; + else if ((s[0] & 0xe0) == 0xc0) { + /* 110XXXXx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[0] & 0xfe) == 0xc0) /* overlong? */ + return s; + else + s += 2; + } else if ((s[0] & 0xf0) == 0xe0) { + /* 1110XXXX 10Xxxxxx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */ + (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */ + (s[0] == 0xef && s[1] == 0xbf && + (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */ + return s; + else + s += 3; + } else if ((s[0] & 0xf8) == 0xf0) { + /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */ + (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */ + return s; + else + s += 4; + } else + return s; + } + + return NULL; +}