1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "Locale.h" 18 19 #include <algorithm> 20 21 #include <hb.h> 22 23 #include "minikin/LocaleList.h" 24 25 #include "LocaleListCache.h" 26 #include "MinikinInternal.h" 27 #include "StringPiece.h" 28 29 namespace minikin { 30 31 constexpr uint32_t FIVE_BITS = 0x1f; 32 registerLocaleList(const std::string & locales)33 uint32_t registerLocaleList(const std::string& locales) { 34 return LocaleListCache::getId(locales); 35 } 36 getLocaleString(uint32_t localeId)37 std::string getLocaleString(uint32_t localeId) { 38 const LocaleList& localeList = LocaleListCache::getById(localeId); 39 std::string out; 40 for (size_t i = 0; i < localeList.size(); ++i) { 41 if (i != 0) { 42 out += ","; 43 } 44 out += localeList[i].getString(); 45 } 46 return out; 47 } 48 49 // Check if a language code supports extension such as emoji and line break etc. according to its 50 // subtag isSubtag(const char * buf,size_t bufLen,const char * subtag,size_t subtagLen)51 static bool isSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) { 52 if (bufLen < subtagLen) { 53 return false; 54 } 55 if (strncmp(buf, subtag, subtagLen) != 0) { 56 return false; // no match between two strings 57 } 58 return (bufLen == subtagLen || buf[subtagLen] == '\0' || buf[subtagLen] == '-' || 59 buf[subtagLen] == '_'); 60 } 61 62 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0. 63 // For the region code, the letters must be all digits in three letter case, so the number of 64 // possible values are 10. For the language code, the letters must be all small alphabets, so the 65 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the 66 // three letter language code or region code to 15 bits. 67 // 68 // In case of two letter code, use fullbit(0x1f) for the first letter instead. packLanguageOrRegion(const StringPiece & in,uint8_t twoLetterBase,uint8_t threeLetterBase)69 static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase, 70 uint8_t threeLetterBase) { 71 if (in.length() == 2) { 72 return 0x7c00u | // 0x1fu << 10 73 (uint16_t)(in[0] - twoLetterBase) << 5 | (uint16_t)(in[1] - twoLetterBase); 74 } else { 75 return ((uint16_t)(in[0] - threeLetterBase) << 10) | 76 (uint16_t)(in[1] - threeLetterBase) << 5 | (uint16_t)(in[2] - threeLetterBase); 77 } 78 } 79 unpackLanguageOrRegion(uint16_t in,char * out,uint8_t twoLetterBase,uint8_t threeLetterBase)80 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase, 81 uint8_t threeLetterBase) { 82 uint8_t first = (in >> 10) & FIVE_BITS; 83 uint8_t second = (in >> 5) & FIVE_BITS; 84 uint8_t third = in & FIVE_BITS; 85 86 if (first == 0x1f) { 87 out[0] = second + twoLetterBase; 88 out[1] = third + twoLetterBase; 89 return 2; 90 } else { 91 out[0] = first + threeLetterBase; 92 out[1] = second + threeLetterBase; 93 out[2] = third + threeLetterBase; 94 return 3; 95 } 96 } 97 packLanguage(const StringPiece & in)98 static uint16_t packLanguage(const StringPiece& in) { 99 return packLanguageOrRegion(in, 'a', 'a'); 100 } 101 unpackLanguage(uint16_t in,char * out)102 static size_t unpackLanguage(uint16_t in, char* out) { 103 return unpackLanguageOrRegion(in, out, 'a', 'a'); 104 } 105 packScript(char c1,char c2,char c3,char c4)106 constexpr uint32_t packScript(char c1, char c2, char c3, char c4) { 107 constexpr char FIRST_LETTER_BASE = 'A'; 108 constexpr char REST_LETTER_BASE = 'a'; 109 return ((uint32_t)(c1 - FIRST_LETTER_BASE) << 15) | (uint32_t)(c2 - REST_LETTER_BASE) << 10 | 110 ((uint32_t)(c3 - REST_LETTER_BASE) << 5) | (uint32_t)(c4 - REST_LETTER_BASE); 111 } 112 packScript(uint32_t script)113 constexpr uint32_t packScript(uint32_t script) { 114 return packScript(script >> 24, (script >> 16) & 0xff, (script >> 8) & 0xff, script & 0xff); 115 } 116 unpackScript(uint32_t packedScript)117 constexpr uint32_t unpackScript(uint32_t packedScript) { 118 constexpr char FIRST_LETTER_BASE = 'A'; 119 constexpr char REST_LETTER_BASE = 'a'; 120 const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE; 121 const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE; 122 const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE; 123 const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE; 124 125 return first << 24 | second << 16 | third << 8 | fourth; 126 } 127 packRegion(const StringPiece & in)128 static uint16_t packRegion(const StringPiece& in) { 129 return packLanguageOrRegion(in, 'A', '0'); 130 } 131 unpackRegion(uint16_t in,char * out)132 static size_t unpackRegion(uint16_t in, char* out) { 133 return unpackLanguageOrRegion(in, out, 'A', '0'); 134 } 135 isLowercase(char c)136 static inline bool isLowercase(char c) { 137 return 'a' <= c && c <= 'z'; 138 } 139 isUppercase(char c)140 static inline bool isUppercase(char c) { 141 return 'A' <= c && c <= 'Z'; 142 } 143 isDigit(char c)144 static inline bool isDigit(char c) { 145 return '0' <= c && c <= '9'; 146 } 147 148 // Returns true if the buffer is valid for language code. isValidLanguageCode(const StringPiece & buffer)149 static inline bool isValidLanguageCode(const StringPiece& buffer) { 150 if (buffer.length() != 2 && buffer.length() != 3) return false; 151 if (!isLowercase(buffer[0])) return false; 152 if (!isLowercase(buffer[1])) return false; 153 if (buffer.length() == 3 && !isLowercase(buffer[2])) return false; 154 return true; 155 } 156 157 // Returns true if buffer is valid for script code. The length of buffer must be 4. isValidScriptCode(const StringPiece & buffer)158 static inline bool isValidScriptCode(const StringPiece& buffer) { 159 return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) && 160 isLowercase(buffer[2]) && isLowercase(buffer[3]); 161 } 162 163 // Returns true if the buffer is valid for region code. isValidRegionCode(const StringPiece & buffer)164 static inline bool isValidRegionCode(const StringPiece& buffer) { 165 return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) || 166 (buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2])); 167 } 168 169 // Parse BCP 47 language identifier into internal structure Locale(const StringPiece & input)170 Locale::Locale(const StringPiece& input) : Locale() { 171 SplitIterator it(input, '-'); 172 173 StringPiece language = it.next(); 174 if (isValidLanguageCode(language)) { 175 mLanguage = packLanguage(language); 176 } else { 177 // We don't understand anything other than two-letter or three-letter 178 // language codes, so we skip parsing the rest of the string. 179 return; 180 } 181 182 if (!it.hasNext()) { 183 return; // Language code only. 184 } 185 StringPiece token = it.next(); 186 187 if (isValidScriptCode(token)) { 188 mScript = packScript(token[0], token[1], token[2], token[3]); 189 mSubScriptBits = scriptToSubScriptBits(mScript); 190 191 if (!it.hasNext()) { 192 goto finalize; // No variant, emoji subtag and region code. 193 } 194 token = it.next(); 195 } 196 197 if (isValidRegionCode(token)) { 198 mRegion = packRegion(token); 199 200 if (!it.hasNext()) { 201 goto finalize; // No variant or emoji subtag. 202 } 203 token = it.next(); 204 } 205 206 if (language == "de") { // We are only interested in German variants. 207 if (token == "1901") { 208 mVariant = Variant::GERMAN_1901_ORTHOGRAPHY; 209 } else if (token == "1996") { 210 mVariant = Variant::GERMAN_1996_ORTHOGRAPHY; 211 } 212 213 if (mVariant != Variant::NO_VARIANT) { 214 if (!it.hasNext()) { 215 goto finalize; // No emoji subtag. 216 } 217 218 token = it.next(); 219 } 220 } 221 222 resolveUnicodeExtension(input.data(), input.length()); 223 224 finalize: 225 if (mEmojiStyle == EmojiStyle::EMPTY) { 226 mEmojiStyle = scriptToEmojiStyle(mScript); 227 } 228 } 229 resolveUnicodeExtension(const char * buf,size_t length)230 void Locale::resolveUnicodeExtension(const char* buf, size_t length) { 231 static const char kPrefix[] = "-u-"; 232 const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix)); 233 if (pos != buf + length) { 234 pos += strlen(kPrefix); 235 const size_t remainingLength = length - (pos - buf); 236 mEmojiStyle = resolveEmojiStyle(pos, remainingLength); 237 } 238 } 239 240 // static 241 // Lookup emoji subtag and determine the emoji style. resolveEmojiStyle(const char * buf,size_t length)242 EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) { 243 // 7 is the length of "-u-em-text", which is the shortest emoji subtag, 244 // unnecessary comparison can be avoided if total length is smaller than 10. 245 const size_t kMinSubtagLength = 7; 246 if (length >= kMinSubtagLength) { 247 static const char kPrefix[] = "em-"; 248 const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix)); 249 if (pos != buf + length) { // found 250 pos += strlen(kPrefix); 251 const size_t remainingLength = length - (pos - buf); 252 if (isSubtag(pos, remainingLength, "emoji", 5)) { 253 return EmojiStyle::EMOJI; 254 } else if (isSubtag(pos, remainingLength, "text", 4)) { 255 return EmojiStyle::TEXT; 256 } else if (isSubtag(pos, remainingLength, "default", 7)) { 257 return EmojiStyle::DEFAULT; 258 } 259 } 260 } 261 return EmojiStyle::EMPTY; 262 } 263 scriptToEmojiStyle(uint32_t script)264 EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) { 265 // If no emoji subtag was provided, resolve the emoji style from script code. 266 if (script == packScript('Z', 's', 'y', 'e')) { 267 return EmojiStyle::EMOJI; 268 } else if (script == packScript('Z', 's', 'y', 'm')) { 269 return EmojiStyle::TEXT; 270 } 271 return EmojiStyle::EMPTY; 272 } 273 274 // static scriptToSubScriptBits(uint32_t script)275 uint8_t Locale::scriptToSubScriptBits(uint32_t script) { 276 uint8_t subScriptBits = 0u; 277 switch (script) { 278 case packScript('B', 'o', 'p', 'o'): 279 subScriptBits = kBopomofoFlag; 280 break; 281 case packScript('H', 'a', 'n', 'g'): 282 subScriptBits = kHangulFlag; 283 break; 284 case packScript('H', 'a', 'n', 'b'): 285 // Bopomofo is almost exclusively used in Taiwan. 286 subScriptBits = kHanFlag | kBopomofoFlag; 287 break; 288 case packScript('H', 'a', 'n', 'i'): 289 subScriptBits = kHanFlag; 290 break; 291 case packScript('H', 'a', 'n', 's'): 292 subScriptBits = kHanFlag | kSimplifiedChineseFlag; 293 break; 294 case packScript('H', 'a', 'n', 't'): 295 subScriptBits = kHanFlag | kTraditionalChineseFlag; 296 break; 297 case packScript('H', 'i', 'r', 'a'): 298 subScriptBits = kHiraganaFlag; 299 break; 300 case packScript('H', 'r', 'k', 't'): 301 subScriptBits = kKatakanaFlag | kHiraganaFlag; 302 break; 303 case packScript('J', 'p', 'a', 'n'): 304 subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag; 305 break; 306 case packScript('K', 'a', 'n', 'a'): 307 subScriptBits = kKatakanaFlag; 308 break; 309 case packScript('K', 'o', 'r', 'e'): 310 subScriptBits = kHanFlag | kHangulFlag; 311 break; 312 } 313 return subScriptBits; 314 } 315 getString() const316 std::string Locale::getString() const { 317 char buf[32]; 318 int i = buildLocaleString(buf); 319 return std::string(buf, i); 320 } 321 getStringWithLineBreakOption(LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle) const322 std::string Locale::getStringWithLineBreakOption(LineBreakStyle lbStyle, 323 LineBreakWordStyle lbWordStyle) const { 324 char buf[48]; 325 int i = buildLocaleString(buf); 326 327 // Add line break unicode extension. 328 if (lbStyle != LineBreakStyle::None || lbWordStyle != LineBreakWordStyle::None) { 329 buf[i++] = '-'; 330 buf[i++] = 'u'; 331 } 332 333 if (lbStyle != LineBreakStyle::None) { 334 buf[i++] = '-'; 335 buf[i++] = 'l'; 336 buf[i++] = 'b'; 337 buf[i++] = '-'; 338 switch (lbStyle) { 339 case LineBreakStyle::Loose: 340 buf[i++] = 'l'; 341 buf[i++] = 'o'; 342 buf[i++] = 'o'; 343 buf[i++] = 's'; 344 buf[i++] = 'e'; 345 break; 346 case LineBreakStyle::Normal: 347 buf[i++] = 'n'; 348 buf[i++] = 'o'; 349 buf[i++] = 'r'; 350 buf[i++] = 'm'; 351 buf[i++] = 'a'; 352 buf[i++] = 'l'; 353 break; 354 case LineBreakStyle::Strict: 355 buf[i++] = 's'; 356 buf[i++] = 't'; 357 buf[i++] = 'r'; 358 buf[i++] = 'i'; 359 buf[i++] = 'c'; 360 buf[i++] = 't'; 361 break; 362 default: 363 MINIKIN_ASSERT(false, "Must not reached."); 364 } 365 } 366 367 if (lbWordStyle != LineBreakWordStyle::None) { 368 buf[i++] = '-'; 369 buf[i++] = 'l'; 370 buf[i++] = 'w'; 371 buf[i++] = '-'; 372 switch (lbWordStyle) { 373 case LineBreakWordStyle::Phrase: 374 buf[i++] = 'p'; 375 buf[i++] = 'h'; 376 buf[i++] = 'r'; 377 buf[i++] = 'a'; 378 buf[i++] = 's'; 379 buf[i++] = 'e'; 380 break; 381 default: 382 MINIKIN_ASSERT(false, "Must not reached."); 383 } 384 } 385 return std::string(buf, i); 386 } 387 buildLocaleString(char * buf) const388 int Locale::buildLocaleString(char* buf) const { 389 size_t i; 390 if (mLanguage == NO_LANGUAGE) { 391 buf[0] = 'u'; 392 buf[1] = 'n'; 393 buf[2] = 'd'; 394 i = 3; 395 } else { 396 i = unpackLanguage(mLanguage, buf); 397 } 398 if (mScript != NO_SCRIPT) { 399 uint32_t rawScript = unpackScript(mScript); 400 buf[i++] = '-'; 401 buf[i++] = (rawScript >> 24) & 0xFFu; 402 buf[i++] = (rawScript >> 16) & 0xFFu; 403 buf[i++] = (rawScript >> 8) & 0xFFu; 404 buf[i++] = rawScript & 0xFFu; 405 } 406 if (mRegion != NO_REGION) { 407 buf[i++] = '-'; 408 i += unpackRegion(mRegion, buf + i); 409 } 410 if (mVariant != Variant::NO_VARIANT) { 411 buf[i++] = '-'; 412 buf[i++] = '1'; 413 buf[i++] = '9'; 414 switch (mVariant) { 415 case Variant::GERMAN_1901_ORTHOGRAPHY: 416 buf[i++] = '0'; 417 buf[i++] = '1'; 418 break; 419 case Variant::GERMAN_1996_ORTHOGRAPHY: 420 buf[i++] = '9'; 421 buf[i++] = '6'; 422 break; 423 default: 424 MINIKIN_ASSERT(false, "Must not reached."); 425 } 426 } 427 return i; 428 } 429 getPartialLocale(SubtagBits bits) const430 Locale Locale::getPartialLocale(SubtagBits bits) const { 431 Locale subLocale; 432 if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) { 433 subLocale.mLanguage = mLanguage; 434 } else { 435 subLocale.mLanguage = packLanguage("und"); 436 } 437 if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) { 438 subLocale.mScript = mScript; 439 subLocale.mSubScriptBits = mSubScriptBits; 440 } 441 if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) { 442 subLocale.mRegion = mRegion; 443 } 444 if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) { 445 subLocale.mVariant = mVariant; 446 } 447 if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) { 448 subLocale.mEmojiStyle = mEmojiStyle; 449 } 450 return subLocale; 451 } 452 isEqualScript(const Locale & other) const453 bool Locale::isEqualScript(const Locale& other) const { 454 return other.mScript == mScript; 455 } 456 457 // static supportsScript(uint8_t providedBits,uint8_t requestedBits)458 bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) { 459 return requestedBits != 0 && (providedBits & requestedBits) == requestedBits; 460 } 461 supportsScript(uint32_t script) const462 bool Locale::supportsScript(uint32_t script) const { 463 static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'), 464 "The Minikin script and HarfBuzz hb_script_t have different encodings."); 465 uint32_t packedScript = packScript(script); 466 if (packedScript == mScript) return true; 467 return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript)); 468 } 469 supportsScript(char c1,char c2,char c3,char c4) const470 bool Locale::supportsScript(char c1, char c2, char c3, char c4) const { 471 uint32_t packedScript = packScript(c1, c2, c3, c4); 472 if (packedScript == mScript) return true; 473 return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript)); 474 } 475 calcScoreFor(const LocaleList & supported) const476 int Locale::calcScoreFor(const LocaleList& supported) const { 477 bool languageScriptMatch = false; 478 bool subtagMatch = false; 479 bool scriptMatch = false; 480 481 for (size_t i = 0; i < supported.size(); ++i) { 482 if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) { 483 subtagMatch = true; 484 if (mLanguage == supported[i].mLanguage) { 485 return 4; 486 } 487 } 488 if (isEqualScript(supported[i]) || 489 supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) { 490 scriptMatch = true; 491 if (mLanguage == supported[i].mLanguage) { 492 languageScriptMatch = true; 493 } 494 } 495 } 496 497 if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) { 498 scriptMatch = true; 499 if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) { 500 return 3; 501 } 502 } 503 504 if (languageScriptMatch) { 505 return 3; 506 } else if (subtagMatch) { 507 return 2; 508 } else if (scriptMatch) { 509 return 1; 510 } 511 return 0; 512 } 513 buildHbLanguage(const Locale & locale)514 static hb_language_t buildHbLanguage(const Locale& locale) { 515 return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1) 516 : HB_LANGUAGE_INVALID; 517 } 518 LocaleList(std::vector<Locale> && locales)519 LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) { 520 mIsAllTheSameLocale = true; 521 mUnionOfSubScriptBits = 0u; 522 mHbLangs.reserve(mLocales.size()); 523 mEmojiStyle = EmojiStyle::EMPTY; 524 const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage; 525 for (const Locale& locale : mLocales) { 526 mUnionOfSubScriptBits |= locale.mSubScriptBits; 527 if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) { 528 mIsAllTheSameLocale = false; 529 } 530 mHbLangs.push_back(buildHbLanguage(locale)); 531 if (mEmojiStyle == EmojiStyle::EMPTY) { 532 mEmojiStyle = locale.getEmojiStyle(); 533 } 534 } 535 } 536 atLeastOneScriptMatch(const LocaleList & list) const537 bool LocaleList::atLeastOneScriptMatch(const LocaleList& list) const { 538 if ((mUnionOfSubScriptBits & list.mUnionOfSubScriptBits) != 0) { 539 return true; 540 } 541 542 for (const Locale& myLocale : mLocales) { 543 for (const Locale& otherLocale : list.mLocales) { 544 if (myLocale.isEqualScript(otherLocale)) { 545 return true; 546 } 547 } 548 } 549 550 return false; 551 } 552 hasScript(char c1,char c2,char c3,char c4) const553 bool LocaleList::hasScript(char c1, char c2, char c3, char c4) const { 554 for (const Locale& locale : mLocales) { 555 if (locale.supportsScript(c1, c2, c3, c4)) { 556 return true; 557 } 558 } 559 return false; 560 } 561 562 } // namespace minikin 563