1 // Scintilla source code edit control 2 /** @file UniConversion.cxx 3 ** Functions to handle UTF-8 and UTF-16 strings. 4 **/ 5 // Copyright 1998-2001 by Neil Hodgson <[email protected]> 6 // The License.txt file describes the conditions under which this software may be distributed. 7 8 #include <cstdlib> 9 10 #include <stdexcept> 11 #include <string> 12 #include <string_view> 13 14 #include "UniConversion.h" 15 16 using namespace Scintilla; 17 18 namespace Scintilla { 19 20 size_t UTF8Length(std::wstring_view wsv) noexcept { 21 size_t len = 0; 22 for (size_t i = 0; i < wsv.length() && wsv[i];) { 23 const unsigned int uch = wsv[i]; 24 if (uch < 0x80) { 25 len++; 26 } else if (uch < 0x800) { 27 len += 2; 28 } else if ((uch >= SURROGATE_LEAD_FIRST) && 29 (uch <= SURROGATE_TRAIL_LAST)) { 30 len += 4; 31 i++; 32 } else { 33 len += 3; 34 } 35 i++; 36 } 37 return len; 38 } 39 40 size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept { 41 size_t positionUTF8 = 0; 42 for (size_t lengthUTF16 = 0; (positionUTF8 < u8Text.length()) && (lengthUTF16 < positionUTF16);) { 43 const unsigned char uch = u8Text[positionUTF8]; 44 const unsigned int byteCount = UTF8BytesOfLead[uch]; 45 lengthUTF16 += UTF16LengthFromUTF8ByteCount(byteCount); 46 positionUTF8 += byteCount; 47 } 48 49 return positionUTF8; 50 } 51 52 void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept { 53 size_t k = 0; 54 for (size_t i = 0; i < wsv.length() && wsv[i];) { 55 const unsigned int uch = wsv[i]; 56 if (uch < 0x80) { 57 putf[k++] = static_cast<char>(uch); 58 } else if (uch < 0x800) { 59 putf[k++] = static_cast<char>(0xC0 | (uch >> 6)); 60 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); 61 } else if ((uch >= SURROGATE_LEAD_FIRST) && 62 (uch <= SURROGATE_TRAIL_LAST)) { 63 // Half a surrogate pair 64 i++; 65 const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (wsv[i] & 0x3ff); 66 putf[k++] = static_cast<char>(0xF0 | (xch >> 18)); 67 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f)); 68 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f)); 69 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f)); 70 } else { 71 putf[k++] = static_cast<char>(0xE0 | (uch >> 12)); 72 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f)); 73 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); 74 } 75 i++; 76 } 77 if (k < len) 78 putf[k] = '\0'; 79 } 80 81 void UTF8FromUTF32Character(int uch, char *putf) noexcept { 82 size_t k = 0; 83 if (uch < 0x80) { 84 putf[k++] = static_cast<char>(uch); 85 } else if (uch < 0x800) { 86 putf[k++] = static_cast<char>(0xC0 | (uch >> 6)); 87 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); 88 } else if (uch < 0x10000) { 89 putf[k++] = static_cast<char>(0xE0 | (uch >> 12)); 90 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f)); 91 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); 92 } else { 93 putf[k++] = static_cast<char>(0xF0 | (uch >> 18)); 94 putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f)); 95 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f)); 96 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f)); 97 } 98 putf[k] = '\0'; 99 } 100 101 size_t UTF16Length(std::string_view svu8) noexcept { 102 size_t ulen = 0; 103 for (size_t i = 0; i< svu8.length();) { 104 const unsigned char ch = svu8[i]; 105 const unsigned int byteCount = UTF8BytesOfLead[ch]; 106 const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount); 107 i += byteCount; 108 ulen += (i > svu8.length()) ? 1 : utf16Len; 109 } 110 return ulen; 111 } 112 113 constexpr unsigned char TrailByteValue(unsigned char c) { 114 // The top 2 bits are 0b10 to indicate a trail byte. 115 // The lower 6 bits contain the value. 116 return c & 0b0011'1111; 117 } 118 119 size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen) { 120 size_t ui = 0; 121 for (size_t i = 0; i < svu8.length();) { 122 unsigned char ch = svu8[i]; 123 const unsigned int byteCount = UTF8BytesOfLead[ch]; 124 unsigned int value; 125 126 if (i + byteCount > svu8.length()) { 127 // Trying to read past end but still have space to write 128 if (ui < tlen) { 129 tbuf[ui] = ch; 130 ui++; 131 } 132 break; 133 } 134 135 const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount); 136 if (ui + outLen > tlen) { 137 throw std::runtime_error("UTF16FromUTF8: attempted write beyond end"); 138 } 139 140 i++; 141 switch (byteCount) { 142 case 1: 143 tbuf[ui] = ch; 144 break; 145 case 2: 146 value = (ch & 0x1F) << 6; 147 ch = svu8[i++]; 148 value += TrailByteValue(ch); 149 tbuf[ui] = static_cast<wchar_t>(value); 150 break; 151 case 3: 152 value = (ch & 0xF) << 12; 153 ch = svu8[i++]; 154 value += (TrailByteValue(ch) << 6); 155 ch = svu8[i++]; 156 value += TrailByteValue(ch); 157 tbuf[ui] = static_cast<wchar_t>(value); 158 break; 159 default: 160 // Outside the BMP so need two surrogates 161 value = (ch & 0x7) << 18; 162 ch = svu8[i++]; 163 value += TrailByteValue(ch) << 12; 164 ch = svu8[i++]; 165 value += TrailByteValue(ch) << 6; 166 ch = svu8[i++]; 167 value += TrailByteValue(ch); 168 tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); 169 ui++; 170 tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST); 171 break; 172 } 173 ui++; 174 } 175 return ui; 176 } 177 178 size_t UTF32Length(std::string_view svu8) noexcept { 179 size_t ulen = 0; 180 for (size_t i = 0; i < svu8.length();) { 181 const unsigned char ch = svu8[i]; 182 const unsigned int byteCount = UTF8BytesOfLead[ch]; 183 i += byteCount; 184 ulen++; 185 } 186 return ulen; 187 } 188 189 size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen) { 190 size_t ui = 0; 191 for (size_t i = 0; i < svu8.length();) { 192 unsigned char ch = svu8[i]; 193 const unsigned int byteCount = UTF8BytesOfLead[ch]; 194 unsigned int value; 195 196 if (i + byteCount > svu8.length()) { 197 // Trying to read past end but still have space to write 198 if (ui < tlen) { 199 tbuf[ui] = ch; 200 ui++; 201 } 202 break; 203 } 204 205 if (ui == tlen) { 206 throw std::runtime_error("UTF32FromUTF8: attempted write beyond end"); 207 } 208 209 i++; 210 switch (byteCount) { 211 case 1: 212 value = ch; 213 break; 214 case 2: 215 value = (ch & 0x1F) << 6; 216 ch = svu8[i++]; 217 value += TrailByteValue(ch); 218 break; 219 case 3: 220 value = (ch & 0xF) << 12; 221 ch = svu8[i++]; 222 value += TrailByteValue(ch) << 6; 223 ch = svu8[i++]; 224 value += TrailByteValue(ch); 225 break; 226 default: 227 value = (ch & 0x7) << 18; 228 ch = svu8[i++]; 229 value += TrailByteValue(ch) << 12; 230 ch = svu8[i++]; 231 value += TrailByteValue(ch) << 6; 232 ch = svu8[i++]; 233 value += TrailByteValue(ch); 234 break; 235 } 236 tbuf[ui] = value; 237 ui++; 238 } 239 return ui; 240 } 241 242 std::wstring WStringFromUTF8(std::string_view svu8) { 243 if constexpr (sizeof(wchar_t) == 2) { 244 const size_t len16 = UTF16Length(svu8); 245 std::wstring ws(len16, 0); 246 UTF16FromUTF8(svu8, &ws[0], len16); 247 return ws; 248 } else { 249 const size_t len32 = UTF32Length(svu8); 250 std::wstring ws(len32, 0); 251 UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32); 252 return ws; 253 } 254 } 255 256 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept { 257 if (val < SUPPLEMENTAL_PLANE_FIRST) { 258 tbuf[0] = static_cast<wchar_t>(val); 259 return 1; 260 } else { 261 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST); 262 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); 263 return 2; 264 } 265 } 266 267 const unsigned char UTF8BytesOfLead[256] = { 268 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F 269 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F 270 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F 271 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F 272 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F 273 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F 274 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F 275 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F 276 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F 277 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F 278 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF 279 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF 280 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF 281 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF 282 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF 283 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF 284 }; 285 286 // Return both the width of the first character in the string and a status 287 // saying whether it is valid or invalid. 288 // Most invalid sequences return a width of 1 so are treated as isolated bytes but 289 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be 290 // reasonably treated as code points in some circumstances. They will, however, 291 // not have associated glyphs. 292 int UTF8Classify(const unsigned char *us, size_t len) noexcept { 293 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 294 if (us[0] < 0x80) { 295 // ASCII 296 return 1; 297 } 298 299 const size_t byteCount = UTF8BytesOfLead[us[0]]; 300 if (byteCount == 1 || byteCount > len) { 301 // Invalid lead byte 302 return UTF8MaskInvalid | 1; 303 } 304 305 if (!UTF8IsTrailByte(us[1])) { 306 // Invalid trail byte 307 return UTF8MaskInvalid | 1; 308 } 309 310 switch (byteCount) { 311 case 2: 312 return 2; 313 314 case 3: 315 if (UTF8IsTrailByte(us[2])) { 316 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) { 317 // Overlong 318 return UTF8MaskInvalid | 1; 319 } 320 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) { 321 // Surrogate 322 return UTF8MaskInvalid | 1; 323 } 324 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) { 325 // U+FFFE non-character - 3 bytes long 326 return UTF8MaskInvalid | 3; 327 } 328 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) { 329 // U+FFFF non-character - 3 bytes long 330 return UTF8MaskInvalid | 3; 331 } 332 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) { 333 // U+FDD0 .. U+FDEF 334 return UTF8MaskInvalid | 3; 335 } 336 return 3; 337 } 338 break; 339 340 default: 341 if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { 342 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { 343 // *FFFE or *FFFF non-character 344 return UTF8MaskInvalid | 4; 345 } 346 if (*us == 0xf4) { 347 // Check if encoding a value beyond the last Unicode character 10FFFF 348 if (us[1] > 0x8f) { 349 return UTF8MaskInvalid | 1; 350 } 351 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { 352 // Overlong 353 return UTF8MaskInvalid | 1; 354 } 355 return 4; 356 } 357 break; 358 } 359 360 return UTF8MaskInvalid | 1; 361 } 362 363 int UTF8DrawBytes(const unsigned char *us, int len) noexcept { 364 const int utf8StatusNext = UTF8Classify(us, len); 365 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth); 366 } 367 368 bool UTF8IsValid(std::string_view svu8) noexcept { 369 const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data()); 370 size_t remaining = svu8.length(); 371 while (remaining > 0) { 372 const int utf8Status = UTF8Classify(us, remaining); 373 if (utf8Status & UTF8MaskInvalid) { 374 return false; 375 } else { 376 const int lenChar = utf8Status & UTF8MaskWidth; 377 us += lenChar; 378 remaining -= lenChar; 379 } 380 } 381 return remaining == 0; 382 } 383 384 // Replace invalid bytes in UTF-8 with the replacement character 385 std::string FixInvalidUTF8(const std::string &text) { 386 std::string result; 387 const char *s = text.c_str(); 388 size_t remaining = text.size(); 389 while (remaining > 0) { 390 const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining); 391 if (utf8Status & UTF8MaskInvalid) { 392 // Replacement character 0xFFFD = UTF8:"efbfbd". 393 result.append("\xef\xbf\xbd"); 394 s++; 395 remaining--; 396 } else { 397 const size_t len = utf8Status & UTF8MaskWidth; 398 result.append(s, len); 399 s += len; 400 remaining -= len; 401 } 402 } 403 return result; 404 } 405 406 } 407