1 files changed, 246 insertions, 148 deletions
diff --git a/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp b/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp
index 2c96c4d..70183fd 100644
--- a/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp
+++ b/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp
@@ -16,8 +16,8 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Basic/TargetInfo.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ErrorHandling.h"
 using namespace clang;
 
 /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
@@ -29,12 +29,31 @@ static int HexDigitValue(char C) {
   return -1;
 }
 
+static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
+  switch (kind) { // Yields the width, in bits, of one character of the literal.
+  default: llvm_unreachable("Unknown token type!");
+  case tok::char_constant:
+  case tok::string_literal:
+  case tok::utf8_string_literal:
+    return Target.getCharWidth(); // Ordinary and UTF-8 literals.
+  case tok::wide_char_constant:
+  case tok::wide_string_literal:
+    return Target.getWCharWidth(); // Wide (L-prefixed) literals.
+  case tok::utf16_char_constant:
+  case tok::utf16_string_literal:
+    return Target.getChar16Width(); // UTF-16 literals.
+  case tok::utf32_char_constant:
+  case tok::utf32_string_literal:
+    return Target.getChar32Width(); // UTF-32 literals.
+  }
+}
+
 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
 /// either a character or a string literal.
 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
                                   const char *ThisTokEnd, bool &HadError,
-                                  FullSourceLoc Loc, bool IsWide,
-                                  Diagnostic *Diags, const TargetInfo &Target) {
+                                  FullSourceLoc Loc, unsigned CharWidth,
+                                  DiagnosticsEngine *Diags) {
   // Skip the '\' char.
   ++ThisTokBuf;
 
@@ -99,9 +118,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     }
 
     // See if any bits will be truncated when evaluated as a character.
-    unsigned CharWidth =
-      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       Overflow = true;
       ResultChar &= ~0U >> (32-CharWidth);
@@ -129,9 +145,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 
     // Check for overflow.  Reject '\777', but not L'\777'.
-    unsigned CharWidth =
-      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       if (Diags)
         Diags->Report(Loc, diag::warn_octal_escape_too_large);
@@ -167,7 +180,7 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
 /// return the UTF32.
 static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                              uint32_t &UcnVal, unsigned short &UcnLen,
-                             FullSourceLoc Loc, Diagnostic *Diags, 
+                             FullSourceLoc Loc, DiagnosticsEngine *Diags, 
                              const LangOptions &Features) {
   if (!Features.CPlusPlus && !Features.C99 && Diags)
     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
@@ -220,7 +233,8 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 /// we will likely rework our support for UCN's.
 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                             char *&ResultBuf, bool &HadError,
-                            FullSourceLoc Loc, bool wide, Diagnostic *Diags, 
+                            FullSourceLoc Loc, unsigned CharByteWidth,
+                            DiagnosticsEngine *Diags,
                             const LangOptions &Features) {
   typedef uint32_t UTF32;
   UTF32 UcnVal = 0;
@@ -231,19 +245,22 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
     return;
   }
 
-  if (wide) {
-    (void)UcnLen;
-    assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
+  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
+         "only character widths of 1, 2, or 4 bytes supported");
 
-    if (!Features.ShortWChar) {
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultBuf++ = (UcnVal & 0x000000FF);
-      *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
-      *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
-      *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
-      return;
-    }
+  (void)UcnLen;
+  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
 
+  if (CharByteWidth == 4) {
+    // Note: our internal rep of wide char tokens is always little-endian.
+    *ResultBuf++ = (UcnVal & 0x000000FF);
+    *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+    *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
+    *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+    return;
+  }
+
+  if (CharByteWidth == 2) {
     // Convert to UTF16.
     if (UcnVal < (UTF32)0xFFFF) {
       *ResultBuf++ = (UcnVal & 0x000000FF);
@@ -262,6 +279,9 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
     return;
   }
+
+  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
+
   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
   // The conversion below was inspired by:
   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
@@ -371,7 +391,7 @@ NumericLiteralParser(const char *begin, const char *end,
       // Done.
     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
-              diag::err_invalid_decimal_digit) << llvm::StringRef(s, 1);
+              diag::err_invalid_decimal_digit) << StringRef(s, 1);
       hadError = true;
       return;
     } else if (*s == '.') {
@@ -434,7 +454,7 @@ NumericLiteralParser(const char *begin, const char *end,
       continue;  // Success.
     case 'i':
     case 'I':
-      if (PP.getLangOptions().Microsoft) {
+      if (PP.getLangOptions().MicrosoftExt) {
         if (isFPConstant || isLong || isLongLong) break;
 
         // Allow i8, i16, i32, i64, and i128.
@@ -498,7 +518,7 @@ NumericLiteralParser(const char *begin, const char *end,
     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
             isFPConstant ? diag::err_invalid_suffix_float_constant :
                            diag::err_invalid_suffix_integer_constant)
-      << llvm::StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
+      << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
     hadError = true;
     return;
   }
@@ -528,7 +548,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
     }
     // A binary exponent can appear with or with a '.'. If dotted, the
     // binary exponent is required.
-    if ((*s == 'p' || *s == 'P') && !PP.getLangOptions().CPlusPlus0x) {
+    if (*s == 'p' || *s == 'P') {
       const char *Exponent = s;
       s++;
       saw_exponent = true;
@@ -542,12 +562,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
       }
       s = first_non_digit;
 
-      // In C++0x, we cannot support hexadecmial floating literals because
-      // they conflict with user-defined literals, so we warn in previous
-      // versions of C++ by default.
-      if (PP.getLangOptions().CPlusPlus)
-        PP.Diag(TokLoc, diag::ext_hexconstant_cplusplus);
-      else if (!PP.getLangOptions().HexFloats)
+      if (!PP.getLangOptions().HexFloats)
         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
     } else if (saw_period) {
       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
@@ -569,7 +584,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
       // Done.
     } else if (isxdigit(*s)) {
       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
-              diag::err_invalid_binary_digit) << llvm::StringRef(s, 1);
+              diag::err_invalid_binary_digit) << StringRef(s, 1);
       hadError = true;
     }
     // Other suffixes will be diagnosed by the caller.
@@ -599,7 +614,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
   // the code is using an incorrect base.
   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
-            diag::err_invalid_octal_digit) << llvm::StringRef(s, 1);
+            diag::err_invalid_octal_digit) << StringRef(s, 1);
     hadError = true;
     return;
   }
@@ -688,7 +703,6 @@ bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
 llvm::APFloat::opStatus
 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
   using llvm::APFloat;
-  using llvm::StringRef;
 
   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
   return Result.convertFromString(StringRef(ThisTokBegin, n),
@@ -696,14 +710,51 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
 }
 
 
+///       character-literal: [C++0x lex.ccon]
+///         ' c-char-sequence '
+///         u' c-char-sequence '
+///         U' c-char-sequence '
+///         L' c-char-sequence '
+///       c-char-sequence:
+///         c-char
+///         c-char-sequence c-char
+///       c-char:
+///         any member of the source character set except the single-quote ',
+///           backslash \, or new-line character
+///         escape-sequence
+///         universal-character-name
+///       escape-sequence: [C++0x lex.ccon]
+///         simple-escape-sequence
+///         octal-escape-sequence
+///         hexadecimal-escape-sequence
+///       simple-escape-sequence:
+///         one of \' \" \? \\ \a \b \f \n \r \t \v
+///       octal-escape-sequence:
+///         \ octal-digit
+///         \ octal-digit octal-digit
+///         \ octal-digit octal-digit octal-digit
+///       hexadecimal-escape-sequence:
+///         \x hexadecimal-digit
+///         hexadecimal-escape-sequence hexadecimal-digit
+///       universal-character-name:
+///         \u hex-quad
+///         \U hex-quad hex-quad
+///       hex-quad:
+///         hex-digit hex-digit hex-digit hex-digit
+///
 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
-                                     SourceLocation Loc, Preprocessor &PP) {
+                                     SourceLocation Loc, Preprocessor &PP,
+                                     tok::TokenKind kind) {
   // At this point we know that the character matches the regex "L?'.*'".
   HadError = false;
 
-  // Determine if this is a wide character.
-  IsWide = begin[0] == 'L';
-  if (IsWide) ++begin;
+  Kind = kind;
+
+  // Determine if this is a wide or UTF character.
+  if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
+      Kind == tok::utf32_char_constant) {
+    ++begin;
+  }
 
   // Skip over the entry quote.
   assert(begin[0] == '\'' && "Invalid token lexed");
@@ -730,8 +781,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
 
       // Is this a Universal Character Name escape?
     if (begin[0] != '\\')     // If this is a normal character, consume it.
-      ResultChar = *begin++;
+      ResultChar = (unsigned char)*begin++;
     else {                    // Otherwise, this is an escape character.
+      unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
       // Check for UCN.
       if (begin[1] == 'u' || begin[1] == 'U') {
         uint32_t utf32 = 0;
@@ -742,19 +794,22 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
           HadError = 1;
         }
         ResultChar = utf32;
+        if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
+          PP.Diag(Loc, diag::warn_ucn_escape_too_large);
+          ResultChar &= ~0U >> (32-CharWidth);
+        }
       } else {
         // Otherwise, this is a non-UCN escape character.  Process it.
         ResultChar = ProcessCharEscape(begin, end, HadError,
                                        FullSourceLoc(Loc,PP.getSourceManager()),
-                                       IsWide,
-                                       &PP.getDiagnostics(), PP.getTargetInfo());
+                                       CharWidth, &PP.getDiagnostics());
       }
     }
 
     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
     // implementation defined (C99 6.4.4.4p10).
     if (NumCharsSoFar) {
-      if (IsWide) {
+      if (!isAscii()) {
         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
         LitVal = 0;
       } else {
@@ -776,8 +831,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
   if (NumCharsSoFar > 1) {
     // Warn about discarding the top bits for multi-char wide-character
     // constants (L'abcd').
-    if (IsWide)
-      PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
+    if (!isAscii())
+      PP.Diag(Loc, diag::warn_extraneous_char_constant);
     else if (NumCharsSoFar != 4)
       PP.Diag(Loc, diag::ext_multichar_character_literal);
     else
@@ -789,47 +844,62 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
   // Transfer the value from APInt to uint64_t
   Value = LitVal.getZExtValue();
 
-  if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
-    PP.Diag(Loc, diag::warn_ucn_escape_too_large);
-
   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
   // character constants are not sign extended in the this implementation:
   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
-  if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
+  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
       PP.getLangOptions().CharIsSigned)
     Value = (signed char)Value;
 }
 
 
-///       string-literal: [C99 6.4.5]
-///          " [s-char-sequence] "
-///         L" [s-char-sequence] "
+///       string-literal: [C++0x lex.string]
+///         [encoding-prefix] " [s-char-sequence] "
+///         [encoding-prefix] R raw-string
+///       encoding-prefix:
+///         u8
+///         u
+///         U
+///         L
 ///       s-char-sequence:
 ///         s-char
 ///         s-char-sequence s-char
 ///       s-char:
-///         any source character except the double quote ",
-///           backslash \, or newline character
-///         escape-character
-///         universal-character-name
-///       escape-character: [C99 6.4.4.4]
-///         \ escape-code
+///         any member of the source character set except the double-quote ",
+///           backslash \, or new-line character
+///         escape-sequence
 ///         universal-character-name
-///       escape-code:
-///         character-escape-code
-///         octal-escape-code
-///         hex-escape-code
-///       character-escape-code: one of
-///         n t b r f v a
-///         \ ' " ?
-///       octal-escape-code:
-///         octal-digit
-///         octal-digit octal-digit
-///         octal-digit octal-digit octal-digit
-///       hex-escape-code:
-///         x hex-digit
-///         hex-escape-code hex-digit
+///       raw-string:
+///         " [d-char-sequence] ( [r-char-sequence] ) [d-char-sequence] "
+///       r-char-sequence:
+///         r-char
+///         r-char-sequence r-char
+///       r-char:
+///         any member of the source character set, except a right parenthesis )
+///           followed by the initial d-char-sequence (which may be empty)
+///           followed by a double quote ".
+///       d-char-sequence:
+///         d-char
+///         d-char-sequence d-char
+///       d-char:
+///         any member of the basic source character set except:
+///           space, the left parenthesis (, the right parenthesis ),
+///           the backslash \, and the control characters representing horizontal
+///           tab, vertical tab, form feed, and newline.
+///       escape-sequence: [C++0x lex.ccon]
+///         simple-escape-sequence
+///         octal-escape-sequence
+///         hexadecimal-escape-sequence
+///       simple-escape-sequence:
+///         one of \' \" \? \\ \a \b \f \n \r \t \v
+///       octal-escape-sequence:
+///         \ octal-digit
+///         \ octal-digit octal-digit
+///         \ octal-digit octal-digit octal-digit
+///       hexadecimal-escape-sequence:
+///         \x hexadecimal-digit
+///         hexadecimal-escape-sequence hexadecimal-digit
 ///       universal-character-name:
 ///         \u hex-quad
 ///         \U hex-quad hex-quad
@@ -841,8 +911,8 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
                     Preprocessor &PP, bool Complain)
   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
-    MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
-    ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
+    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
   init(StringToks, NumStringToks);
 }
 
@@ -862,7 +932,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
   MaxTokenLength = StringToks[0].getLength();
   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
-  AnyWide = StringToks[0].is(tok::wide_string_literal);
+  Kind = StringToks[0].getKind();
 
   hadError = false;
 
@@ -883,8 +953,18 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
     if (StringToks[i].getLength() > MaxTokenLength)
       MaxTokenLength = StringToks[i].getLength();
 
-    // Remember if we see any wide strings.
-    AnyWide |= StringToks[i].is(tok::wide_string_literal);
+    // Remember if we see any wide or utf-8/16/32 strings.
+    // Also check for illegal concatenations.
+    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
+      if (isAscii()) {
+        Kind = StringToks[i].getKind();
+      } else {
+        if (Diags)
+          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+                        diag::err_unsupported_string_concat);
+        hadError = true;
+      }
+    }
   }
 
   // Include space for the null terminator.
@@ -892,19 +972,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 
   // TODO: K&R warning: "traditional C rejects string constant concatenation"
 
-  // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
-  // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
-  wchar_tByteWidth = ~0U;
-  if (AnyWide) {
-    wchar_tByteWidth = Target.getWCharWidth();
-    assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
-    wchar_tByteWidth /= 8;
-  }
+  // Get the width in bytes of char/wchar_t/char16_t/char32_t
+  CharByteWidth = getCharWidth(Kind, Target);
+  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
+  CharByteWidth /= 8;
 
   // The output buffer size needs to be large enough to hold wide characters.
   // This is a worst-case assumption which basically corresponds to L"" "long".
-  if (AnyWide)
-    SizeBound *= wchar_tByteWidth;
+  SizeBound *= CharByteWidth;
 
   // Size the temporary buffer to hold the result string data.
   ResultBuf.resize(SizeBound);
@@ -929,78 +1004,82 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                          &StringInvalid);
     if (StringInvalid) {
-      hadError = 1;
+      hadError = true;
       continue;
     }
 
     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
-    bool wide = false;
     // TODO: Input character set mapping support.
 
-    // Skip L marker for wide strings.
-    if (ThisTokBuf[0] == 'L') {
-      wide = true;
+    // Skip marker for wide or unicode strings.
+    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
       ++ThisTokBuf;
+      // Skip 8 of u8 marker for utf8 strings.
+      if (ThisTokBuf[0] == '8')
+        ++ThisTokBuf;
     }
 
-    assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
-    ++ThisTokBuf;
-
-    // Check if this is a pascal string
-    if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
-        ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
+    // Check for raw string
+    if (ThisTokBuf[0] == 'R') {
+      ThisTokBuf += 2; // skip R"
 
-      // If the \p sequence is found in the first token, we have a pascal string
-      // Otherwise, if we already have a pascal string, ignore the first \p
-      if (i == 0) {
+      const char *Prefix = ThisTokBuf;
+      while (ThisTokBuf[0] != '(')
         ++ThisTokBuf;
-        Pascal = true;
-      } else if (Pascal)
-        ThisTokBuf += 2;
-    }
+      ++ThisTokBuf; // skip '('
+
+      // remove same number of characters from the end
+      if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix))
+        ThisTokEnd -= (ThisTokBuf - Prefix);
+
+      // Copy the string over
+      CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf));
+    } else {
+      assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
+      ++ThisTokBuf; // skip "
+
+      // Check if this is a pascal string
+      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
+          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
 
-    while (ThisTokBuf != ThisTokEnd) {
-      // Is this a span of non-escape characters?
-      if (ThisTokBuf[0] != '\\') {
-        const char *InStart = ThisTokBuf;
-        do {
+        // If the \p sequence is found in the first token, we have a pascal string
+        // Otherwise, if we already have a pascal string, ignore the first \p
+        if (i == 0) {
           ++ThisTokBuf;
-        } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
-
-        // Copy the character span over.
-        unsigned Len = ThisTokBuf-InStart;
-        if (!AnyWide) {
-          memcpy(ResultPtr, InStart, Len);
-          ResultPtr += Len;
-        } else {
-          // Note: our internal rep of wide char tokens is always little-endian.
-          for (; Len; --Len, ++InStart) {
-            *ResultPtr++ = InStart[0];
-            // Add zeros at the end.
-            for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-              *ResultPtr++ = 0;
-          }
-        }
-        continue;
-      }
-      // Is this a Universal Character Name escape?
-      if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
-        EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
-                        hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
-                        wide, Diags, Features);
-        continue;
+          Pascal = true;
+        } else if (Pascal)
+          ThisTokBuf += 2;
       }
-      // Otherwise, this is a non-UCN escape character.  Process it.
-      unsigned ResultChar =
-        ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
-                          FullSourceLoc(StringToks[i].getLocation(), SM),
-                          AnyWide, Diags, Target);
 
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultPtr++ = ResultChar & 0xFF;
+      while (ThisTokBuf != ThisTokEnd) {
+        // Is this a span of non-escape characters?
+        if (ThisTokBuf[0] != '\\') {
+          const char *InStart = ThisTokBuf;
+          do {
+            ++ThisTokBuf;
+          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
+
+          // Copy the character span over.
+          CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart));
+          continue;
+        }
+        // Is this a Universal Character Name escape?
+        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
+          EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
+                          hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
+                          CharByteWidth, Diags, Features);
+          continue;
+        }
+        // Otherwise, this is a non-UCN escape character.  Process it.
+        unsigned ResultChar =
+          ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+                            FullSourceLoc(StringToks[i].getLocation(), SM),
+                            CharByteWidth*8, Diags);
+
+        // Note: our internal rep of wide char tokens is always little-endian.
+        *ResultPtr++ = ResultChar & 0xFF;
 
-      if (AnyWide) {
-        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+        for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
           *ResultPtr++ = ResultChar >> i*8;
       }
     }
@@ -1008,8 +1087,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 
   if (Pascal) {
     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
-    if (AnyWide)
-      ResultBuf[0] /= wchar_tByteWidth;
+    ResultBuf[0] /= CharByteWidth;
 
     // Verify that pascal strings aren't too large.
     if (GetStringLength() > 256) {
@@ -1018,7 +1096,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
                       diag::err_pascal_string_too_long)
           << SourceRange(StringToks[0].getLocation(),
                          StringToks[NumStringToks-1].getLocation());
-      hadError = 1;
+      hadError = true;
       return;
     }
   } else if (Diags) {
@@ -1036,6 +1114,25 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 }
 
 
+/// CopyStringFragment - This function copies Fragment into ResultPtr and
+/// advances ResultPtr, widening each byte to CharByteWidth bytes if needed.
+void StringLiteralParser::CopyStringFragment(StringRef Fragment) {
+  // One-byte characters need no widening: copy the span over verbatim.
+  if (CharByteWidth == 1) {
+    memcpy(ResultPtr, Fragment.data(), Fragment.size());
+    ResultPtr += Fragment.size();
+  } else {
+    // Note: our internal rep of wide char tokens is always little-endian.
+    for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) {
+      *ResultPtr++ = *I; // The source byte is stored first (low-order byte).
+      // Pad with zeros to fill the remaining CharByteWidth-1 bytes.
+      for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
+        *ResultPtr++ = 0;
+    }
+  }
+}
+
+
 /// getOffsetOfStringByte - This function returns the offset of the
 /// specified byte of the string data represented by Token.  This handles
 /// advancing over escape sequences in the string.
@@ -1052,7 +1149,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
   if (StringInvalid)
     return 0;
 
-  assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
+  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
+         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
 
 
   const char *SpellingStart = SpellingPtr;
@@ -1077,7 +1175,7 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
     bool HadError = false;
     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
                       FullSourceLoc(Tok.getLocation(), SM),
-                      false, Diags, Target);
+                      CharByteWidth*8, Diags);
     assert(!HadError && "This method isn't valid on erroneous strings");
     --ByteNo;
   }