Import trimmed svn-1.8.0-rc3

author: peter <peter@FreeBSD.org> 2013-06-18 02:07:41 +0000
committer: peter <peter@FreeBSD.org> 2013-06-18 02:07:41 +0000
commit: d25dac7fcc6acc838b71bbda8916fd9665c709ab (patch)
tree: 135691142dc0e75a5e5d97b5074d03436435b8e0 /subversion/libsvn_subr/utf_validate.c
download: FreeBSD-src-d25dac7fcc6acc838b71bbda8916fd9665c709ab.zip
FreeBSD-src-d25dac7fcc6acc838b71bbda8916fd9665c709ab.tar.gz
1 files changed, 485 insertions, 0 deletions
diff --git a/subversion/libsvn_subr/utf_validate.c b/subversion/libsvn_subr/utf_validate.c
new file mode 100644
index 0000000..8311fd7
--- /dev/null
+++ b/subversion/libsvn_subr/utf_validate.c
@@ -0,0 +1,485 @@
+/*
+ * utf_validate.c:  Validate a UTF-8 string
+ *
+ * ====================================================================
+ *    Licensed to the Apache Software Foundation (ASF) under one
+ *    or more contributor license agreements.  See the NOTICE file
+ *    distributed with this work for additional information
+ *    regarding copyright ownership.  The ASF licenses this file
+ *    to you under the Apache License, Version 2.0 (the
+ *    "License"); you may not use this file except in compliance
+ *    with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing,
+ *    software distributed under the License is distributed on an
+ *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *    KIND, either express or implied.  See the License for the
+ *    specific language governing permissions and limitations
+ *    under the License.
+ * ====================================================================
+ */
+
+/* Validate a UTF-8 string according to the rules in
+ *
+ *    Table 3-6. Well-Formed UTF-8 Bytes Sequences
+ *
+ * in
+ *
+ *    The Unicode Standard, Version 4.0
+ *
+ * which is available at
+ *
+ *    http://www.unicode.org/
+ *
+ * UTF-8 was originally defined in RFC-2279; Unicode's "well-formed UTF-8"
+ * is a subset of that encoding.  The Unicode encoding prohibits things
+ * like non-shortest encodings (some characters can be represented by more
+ * than one multi-byte encoding) and the encodings for the surrogate code
+ * points.  RFC-3629 supersedes RFC-2279 and adopts the same well-formed
+ * rules as Unicode.  This is the ABNF in RFC-3629 that describes
+ * well-formed UTF-8 rules:
+ *
+ *   UTF8-octets = *( UTF8-char )
+ *   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+ *   UTF8-1      = %x00-7F
+ *   UTF8-2      = %xC2-DF UTF8-tail
+ *   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
+ *                 %xE1-EC 2( UTF8-tail ) /
+ *                 %xED %x80-9F UTF8-tail /
+ *                 %xEE-EF 2( UTF8-tail )
+ *   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
+ *                 %xF1-F3 3( UTF8-tail ) /
+ *                 %xF4 %x80-8F 2( UTF8-tail )
+ *   UTF8-tail   = %x80-BF
+ *
+ */
+
+#include "private/svn_utf_private.h"
+#include "private/svn_eol_private.h"
+#include "private/svn_dep_compat.h"
+
+/* Lookup table to categorise each octet in the string.  The category
+ * (0-13) of an octet is the column index used with the transition
+ * table `machine'; the comment at the end of each row gives the octet
+ * range that the row covers. */
+static const char octet_category[256] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0x00-0x7f */
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, /* 0x80-0x8f */
+  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, /* 0x90-0x9f */
+  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, /* 0xa0-0xbf */
+  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+  4,  4,                                                         /* 0xc0-0xc1 */
+          5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, /* 0xc2-0xdf */
+  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+  6,                                                             /* 0xe0 */
+      7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,             /* 0xe1-0xec */
+                                                      8,         /* 0xed */
+                                                          9,  9, /* 0xee-0xef */
+  10,                                                            /* 0xf0 */
+      11, 11, 11,                                                /* 0xf1-0xf3 */
+                  12,                                            /* 0xf4 */
+                      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
+};
+
+/* Machine states.  FSM_START is the state between characters and
+ * FSM_ERROR marks an ill-formed sequence.  Every other state is named
+ * after the range the next octet must lie in, e.g. FSM_A0BF accepts only
+ * 0xa0-0xbf; FSM_80BF80BF and FSM_80BF80BF80BF expect two and three
+ * further octets in 0x80-0xbf respectively.  The values are the row
+ * indices of the transition table `machine'. */
+#define FSM_START         0
+#define FSM_80BF          1
+#define FSM_A0BF          2
+#define FSM_80BF80BF      3
+#define FSM_809F          4
+#define FSM_90BF          5
+#define FSM_80BF80BF80BF  6
+#define FSM_808F          7
+#define FSM_ERROR         8
+
+/* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
+   same transitions, as do categories 0xe1-0xec and 0xee-0xef.  I wonder if
+   there is any great benefit in combining categories?  It would reduce the
+   memory footprint of the transition table by 16 bytes, but might it be
+   harder to understand?  */
+
+/* Machine transition table: machine[STATE][CATEGORY] is the state entered
+ * when an octet of category CATEGORY (see octet_category) is read while
+ * in state STATE.  Every entry in the FSM_ERROR row is FSM_ERROR, so that
+ * state is never left once entered. */
+static const char machine [9][14] = {
+  /* FSM_START */
+  {FSM_START,         /* 0x00-0x7f */
+   FSM_ERROR,         /* 0x80-0x8f */
+   FSM_ERROR,         /* 0x90-0x9f */
+   FSM_ERROR,         /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_80BF,          /* 0xc2-0xdf */
+   FSM_A0BF,          /* 0xe0 */
+   FSM_80BF80BF,      /* 0xe1-0xec */
+   FSM_809F,          /* 0xed */
+   FSM_80BF80BF,      /* 0xee-0xef */
+   FSM_90BF,          /* 0xf0 */
+   FSM_80BF80BF80BF,  /* 0xf1-0xf3 */
+   FSM_808F,          /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+
+  /* FSM_80BF */
+  {FSM_ERROR,         /* 0x00-0x7f */
+   FSM_START,         /* 0x80-0x8f */
+   FSM_START,         /* 0x90-0x9f */
+   FSM_START,         /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_ERROR,         /* 0xc2-0xdf */
+   FSM_ERROR,         /* 0xe0 */
+   FSM_ERROR,         /* 0xe1-0xec */
+   FSM_ERROR,         /* 0xed */
+   FSM_ERROR,         /* 0xee-0xef */
+   FSM_ERROR,         /* 0xf0 */
+   FSM_ERROR,         /* 0xf1-0xf3 */
+   FSM_ERROR,         /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+
+  /* FSM_A0BF */
+  {FSM_ERROR,         /* 0x00-0x7f */
+   FSM_ERROR,         /* 0x80-0x8f */
+   FSM_ERROR,         /* 0x90-0x9f */
+   FSM_80BF,          /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_ERROR,         /* 0xc2-0xdf */
+   FSM_ERROR,         /* 0xe0 */
+   FSM_ERROR,         /* 0xe1-0xec */
+   FSM_ERROR,         /* 0xed */
+   FSM_ERROR,         /* 0xee-0xef */
+   FSM_ERROR,         /* 0xf0 */
+   FSM_ERROR,         /* 0xf1-0xf3 */
+   FSM_ERROR,         /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+
+  /* FSM_80BF80BF */
+  {FSM_ERROR,         /* 0x00-0x7f */
+   FSM_80BF,          /* 0x80-0x8f */
+   FSM_80BF,          /* 0x90-0x9f */
+   FSM_80BF,          /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_ERROR,         /* 0xc2-0xdf */
+   FSM_ERROR,         /* 0xe0 */
+   FSM_ERROR,         /* 0xe1-0xec */
+   FSM_ERROR,         /* 0xed */
+   FSM_ERROR,         /* 0xee-0xef */
+   FSM_ERROR,         /* 0xf0 */
+   FSM_ERROR,         /* 0xf1-0xf3 */
+   FSM_ERROR,         /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+
+  /* FSM_809F */
+  {FSM_ERROR,         /* 0x00-0x7f */
+   FSM_80BF,          /* 0x80-0x8f */
+   FSM_80BF,          /* 0x90-0x9f */
+   FSM_ERROR,         /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_ERROR,         /* 0xc2-0xdf */
+   FSM_ERROR,         /* 0xe0 */
+   FSM_ERROR,         /* 0xe1-0xec */
+   FSM_ERROR,         /* 0xed */
+   FSM_ERROR,         /* 0xee-0xef */
+   FSM_ERROR,         /* 0xf0 */
+   FSM_ERROR,         /* 0xf1-0xf3 */
+   FSM_ERROR,         /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+
+  /* FSM_90BF */
+  {FSM_ERROR,         /* 0x00-0x7f */
+   FSM_ERROR,         /* 0x80-0x8f */
+   FSM_80BF80BF,      /* 0x90-0x9f */
+   FSM_80BF80BF,      /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_ERROR,         /* 0xc2-0xdf */
+   FSM_ERROR,         /* 0xe0 */
+   FSM_ERROR,         /* 0xe1-0xec */
+   FSM_ERROR,         /* 0xed */
+   FSM_ERROR,         /* 0xee-0xef */
+   FSM_ERROR,         /* 0xf0 */
+   FSM_ERROR,         /* 0xf1-0xf3 */
+   FSM_ERROR,         /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+
+  /* FSM_80BF80BF80BF */
+  {FSM_ERROR,         /* 0x00-0x7f */
+   FSM_80BF80BF,      /* 0x80-0x8f */
+   FSM_80BF80BF,      /* 0x90-0x9f */
+   FSM_80BF80BF,      /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_ERROR,         /* 0xc2-0xdf */
+   FSM_ERROR,         /* 0xe0 */
+   FSM_ERROR,         /* 0xe1-0xec */
+   FSM_ERROR,         /* 0xed */
+   FSM_ERROR,         /* 0xee-0xef */
+   FSM_ERROR,         /* 0xf0 */
+   FSM_ERROR,         /* 0xf1-0xf3 */
+   FSM_ERROR,         /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+
+  /* FSM_808F */
+  {FSM_ERROR,         /* 0x00-0x7f */
+   FSM_80BF80BF,      /* 0x80-0x8f */
+   FSM_ERROR,         /* 0x90-0x9f */
+   FSM_ERROR,         /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_ERROR,         /* 0xc2-0xdf */
+   FSM_ERROR,         /* 0xe0 */
+   FSM_ERROR,         /* 0xe1-0xec */
+   FSM_ERROR,         /* 0xed */
+   FSM_ERROR,         /* 0xee-0xef */
+   FSM_ERROR,         /* 0xf0 */
+   FSM_ERROR,         /* 0xf1-0xf3 */
+   FSM_ERROR,         /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+
+  /* FSM_ERROR */
+  {FSM_ERROR,         /* 0x00-0x7f */
+   FSM_ERROR,         /* 0x80-0x8f */
+   FSM_ERROR,         /* 0x90-0x9f */
+   FSM_ERROR,         /* 0xa0-0xbf */
+   FSM_ERROR,         /* 0xc0-0xc1 */
+   FSM_ERROR,         /* 0xc2-0xdf */
+   FSM_ERROR,         /* 0xe0 */
+   FSM_ERROR,         /* 0xe1-0xec */
+   FSM_ERROR,         /* 0xed */
+   FSM_ERROR,         /* 0xee-0xef */
+   FSM_ERROR,         /* 0xf0 */
+   FSM_ERROR,         /* 0xf1-0xf3 */
+   FSM_ERROR,         /* 0xf4 */
+   FSM_ERROR},        /* 0xf5-0xff */
+};
+
+/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
+ * category 0 (FSM_START), i.e. for octets >= 0x80.  Return the position
+ * of the first such char or DATA + MAX_LEN if all were cat 0.
+ *
+ * The bulk of the input is tested one machine word at a time.  When a
+ * word contains an octet with bit 7 set, the final byte-wise loop
+ * locates that octet within the word.
+ */
+static const char *
+first_non_fsm_start_char(const char *data, apr_size_t max_len)
+{
+#if !SVN_UNALIGNED_ACCESS_IS_OK
+
+  /* On some systems, we need to make sure that DATA is properly aligned
+   * for chunky data access.
+   */
+  apr_size_t misalignment = (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1);
+  if (misalignment)
+    {
+      /* Number of leading bytes up to the next word boundary.  This must
+       * be SIZE - MISALIGNMENT; `~DATA & (SIZE-1)' is one byte short and
+       * would leave DATA misaligned for the word-wise loop below.
+       */
+      apr_size_t len = sizeof(apr_uintptr_t) - misalignment;
+      if (len > max_len)
+        len = max_len;
+      max_len -= len;
+
+      /* The cast makes the test independent of the signedness of char. */
+      for (; len > 0; ++data, --len)
+        if ((unsigned char)*data >= 0x80)
+          return data;
+    }
+
+#endif
+
+  /* Scan the input one machine word at a time. */
+  for (; max_len > sizeof(apr_uintptr_t)
+       ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
+    if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
+      break;
+
+  /* The remaining odd bytes will be examined the naive way: */
+  for (; max_len > 0; ++data, --max_len)
+    if ((unsigned char)*data >= 0x80)
+      break;
+
+  return data;
+}
+
+/* Scan the NUL-terminated string at DATA for the first char that is not
+ * in the octet category 0 (FSM_START).  Return the position of that
+ * char, or of the terminating NUL if there is no such char.
+ */
+static const char *
+first_non_fsm_start_char_cstring(const char *data)
+{
+  const apr_uintptr_t word_mask = sizeof(apr_uintptr_t) - 1;
+
+  /* The string's length is unknown, so word-sized reads are only made
+   * from aligned addresses: an unaligned chunk read reaching beyond the
+   * NUL terminator could result in a segfault.  Step byte-wise up to
+   * the first word boundary.
+   */
+  while ((apr_uintptr_t)data & word_mask)
+    {
+      if (*data <= 0 || *data >= 0x80)
+        return data;
+      ++data;
+    }
+
+#ifndef SVN_UTF_NO_UNINITIALISED_ACCESS
+  /* Scan the input one machine word at a time.
+   *
+   * This may read allocated but uninitialised bytes beyond the
+   * terminating NUL.  Such bytes are always readable and the result
+   * does not depend on their values, but memory checking tools such as
+   * valgrind and GCC 4.8's address sanitizer will object, hence this
+   * part can be disabled at compile time.
+   */
+  for (;;)
+    {
+      apr_uintptr_t word = *(const apr_uintptr_t *)data;
+
+      /* Stop at a word containing a non-ASCII char ... */
+      if (word & SVN__BIT_7_SET)
+        break;
+
+      /* ... or a NUL byte (the well-known strlen test). */
+      word |= (word & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
+      if ((word & SVN__BIT_7_SET) != SVN__BIT_7_SET)
+        break;
+
+      data += sizeof(apr_uintptr_t);
+    }
+#endif
+
+  /* Locate the exact byte the naive way. */
+  while (*data > 0 && *data < 0x80)
+    ++data;
+
+  return data;
+}
+
+/* Return a pointer just past the last complete, well-formed UTF-8
+ * character in the longest valid prefix of the LEN bytes at DATA.
+ * Returns DATA + LEN if the whole buffer is well-formed.
+ */
+const char *
+svn_utf__last_valid(const char *data, apr_size_t len)
+{
+  /* Skip the leading run of plain ASCII quickly. */
+  const char *start = first_non_fsm_start_char(data, len);
+  const char *end = data + len;
+  int state = FSM_START;
+
+  data = start;
+  while (data < end)
+    {
+      unsigned char octet = *data++;
+      int category = octet_category[octet];
+      state = machine[state][category];
+      if (state == FSM_START)
+        start = data;   /* a character was completed here */
+      else if (state == FSM_ERROR)
+        break;          /* FSM_ERROR is never left; START cannot advance */
+    }
+  return start;
+}
+
+/* Return TRUE if the NUL-terminated string DATA consists only of complete,
+ * well-formed UTF-8 sequences, FALSE otherwise (including DATA == NULL).
+ */
+svn_boolean_t
+svn_utf__cstring_is_valid(const char *data)
+{
+  const char *cursor;
+  int fsm_state = FSM_START;
+
+  if (!data)
+    return FALSE;
+
+  /* Jump over the leading ASCII run, then feed every remaining octet up
+     to the terminating NUL through the state machine. */
+  for (cursor = first_non_fsm_start_char_cstring(data);
+       *cursor;
+       ++cursor)
+    {
+      const unsigned char octet = *cursor;
+      fsm_state = machine[fsm_state][(int)octet_category[octet]];
+    }
+
+  /* Valid only if the input did not stop in the middle of a sequence. */
+  return fsm_state == FSM_START;
+}
+
+/* Return TRUE if the LEN bytes at DATA consist only of complete,
+ * well-formed UTF-8 sequences, FALSE otherwise (including DATA == NULL).
+ */
+svn_boolean_t
+svn_utf__is_valid(const char *data, apr_size_t len)
+{
+  const char *end;
+  int state = FSM_START;
+
+  if (!data)
+    return FALSE;
+
+  /* Compute END only after the NULL check: pointer arithmetic on a null
+     pointer is undefined. */
+  end = data + len;
+  data = first_non_fsm_start_char(data, len);
+
+  while (data < end)
+    {
+      unsigned char octet = *data++;
+      int category = octet_category[octet];
+      state = machine[state][category];
+      if (state == FSM_ERROR)
+        return FALSE;   /* FSM_ERROR is never left; no need to go on */
+    }
+  return state == FSM_START;
+}
+
+/* Return a pointer just past the last complete, well-formed UTF-8
+ * character in the longest valid prefix of the LEN bytes at DATA.
+ * The state machine is walked with explicit octet range comparisons
+ * rather than through the octet_category and machine lookup tables.
+ */
+const char *
+svn_utf__last_valid2(const char *data, apr_size_t len)
+{
+  const char *last_good = first_non_fsm_start_char(data, len);
+  const char *const limit = data + len;
+  const char *cursor;
+  int state = FSM_START;
+
+  for (cursor = last_good; cursor < limit; )
+    {
+      const unsigned char octet = *cursor++;
+      /* Is OCTET a generic tail octet (UTF8-tail, 0x80-0xBF)? */
+      const int is_tail = (octet >= 0x80 && octet <= 0xBF);
+
+      switch (state)
+        {
+        case FSM_START:
+          /* Classify the lead octet, highest ranges first. */
+          if (octet >= 0xF5)
+            state = FSM_ERROR;
+          else if (octet == 0xF4)
+            state = FSM_808F;
+          else if (octet >= 0xF1)
+            state = FSM_80BF80BF80BF;
+          else if (octet == 0xF0)
+            state = FSM_90BF;
+          else if (octet == 0xED)
+            state = FSM_809F;
+          else if (octet >= 0xE1)       /* 0xE1-0xEC and 0xEE-0xEF */
+            state = FSM_80BF80BF;
+          else if (octet == 0xE0)
+            state = FSM_A0BF;
+          else if (octet >= 0xC2)
+            state = FSM_80BF;
+          else if (octet >= 0x80)       /* stray tail or 0xC0-0xC1 */
+            state = FSM_ERROR;
+          /* else: plain ASCII, remain in FSM_START */
+          break;
+
+        case FSM_80BF:
+          state = is_tail ? FSM_START : FSM_ERROR;
+          break;
+
+        case FSM_A0BF:
+          state = (is_tail && octet >= 0xA0) ? FSM_80BF : FSM_ERROR;
+          break;
+
+        case FSM_80BF80BF:
+          state = is_tail ? FSM_80BF : FSM_ERROR;
+          break;
+
+        case FSM_809F:
+          state = (is_tail && octet <= 0x9F) ? FSM_80BF : FSM_ERROR;
+          break;
+
+        case FSM_90BF:
+          state = (is_tail && octet >= 0x90) ? FSM_80BF80BF : FSM_ERROR;
+          break;
+
+        case FSM_80BF80BF80BF:
+          state = is_tail ? FSM_80BF80BF : FSM_ERROR;
+          break;
+
+        case FSM_808F:
+          state = (is_tail && octet <= 0x8F) ? FSM_80BF80BF : FSM_ERROR;
+          break;
+
+        case FSM_ERROR:
+        default:
+          /* An earlier octet was ill-formed; nothing more can be valid. */
+          return last_good;
+        }
+
+      /* Back in FSM_START means a whole character has been consumed. */
+      if (state == FSM_START)
+        last_good = cursor;
+    }
+  return last_good;
+}
author	peter <peter@FreeBSD.org>	2013-06-18 02:07:41 +0000
committer	peter <peter@FreeBSD.org>	2013-06-18 02:07:41 +0000
commit	d25dac7fcc6acc838b71bbda8916fd9665c709ab (patch)
tree	135691142dc0e75a5e5d97b5074d03436435b8e0 /subversion/libsvn_subr/utf_validate.c
download	FreeBSD-src-d25dac7fcc6acc838b71bbda8916fd9665c709ab.zip FreeBSD-src-d25dac7fcc6acc838b71bbda8916fd9665c709ab.tar.gz