summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jessica Hamilton <jessica.l.hamilton@gmail.com> 2017-01-11 16:00:08 (GMT)
committer: Jessica Hamilton <jessica.l.hamilton@gmail.com> 2017-01-11 17:39:55 (GMT)
commit: 92b9c8649b57a49d1880a9025611797cad7e9bec (patch)
tree: 9c2a06852367454c9ee525d7f08e9d4edd46ac4e
parent: d2423e4b3c976c4fab0412afded816a3591db333 (diff)
MultibyteToWchar: correctly handle UTF-16 surrogate pairs. (hrev50859)
* While WcharToMultibyte correctly converts our UTF-32 wchar_t characters to multibyte, the reverse conversion in MultibyteToWchar did not handle characters that ICU returns as a UTF-16 surrogate pair. Now, if we detect a leading surrogate, we re-read the multibyte sequence with room for a UTF-16 pair, which allows U16_GET to correctly combine the pair into the needed UTF-32 code point. Fixes #13184.
-rw-r--r-- src/system/libroot/add-ons/icu/ICUCtypeData.cpp 17
1 files changed, 15 insertions, 2 deletions
diff --git a/src/system/libroot/add-ons/icu/ICUCtypeData.cpp b/src/system/libroot/add-ons/icu/ICUCtypeData.cpp
index 3940c80..4ff11ac 100644
--- a/src/system/libroot/add-ons/icu/ICUCtypeData.cpp
+++ b/src/system/libroot/add-ons/icu/ICUCtypeData.cpp
@@ -223,13 +223,26 @@ ICUCtypeData::MultibyteToWchar(wchar_t* wcOut, const char* mb, size_t mbLen,
UErrorCode icuStatus = U_ZERO_ERROR;
const char* buffer = mb;
- UChar targetBuffer[2];
+ UChar targetBuffer[3];
UChar* target = targetBuffer;
ucnv_toUnicode(converter, &target, target + 1, &buffer, buffer + mbLen,
NULL, FALSE, &icuStatus);
size_t sourceLengthUsed = buffer - mb;
size_t targetLengthUsed = (size_t)(target - targetBuffer);
+ if (U16_IS_LEAD(targetBuffer[0])) {
+ // we have a surrogate pair, so re-read with enough space for a pair
+ // of characters instead
+ TRACE(("MultibyteToWchar(): have a surrogate pair\n"));
+ ucnv_resetToUnicode(converter);
+ buffer = mb;
+ target = targetBuffer;
+ ucnv_toUnicode(converter, &target, target + 2, &buffer, buffer + mbLen,
+ NULL, FALSE, &icuStatus);
+ sourceLengthUsed = buffer - mb;
+ targetLengthUsed = (size_t)(target - targetBuffer);
+ }
+
if (icuStatus == U_BUFFER_OVERFLOW_ERROR && targetLengthUsed > 0) {
// we've got one character, which is all that we wanted
icuStatus = U_ZERO_ERROR;
@@ -248,7 +261,7 @@ ICUCtypeData::MultibyteToWchar(wchar_t* wcOut, const char* mb, size_t mbLen,
result = B_BAD_INDEX;
} else {
UChar32 unicodeChar = 0xBADBEEF;
- U16_GET(targetBuffer, 0, 0, 2, unicodeChar);
+ U16_GET(targetBuffer, 0, 0, targetLengthUsed, unicodeChar);
if (unicodeChar == 0) {
// reset to initial state