xref: /aosp_15_r20/frameworks/minikin/libs/minikin/Locale.cpp (revision 834a2baab5fdfc28e9a428ee87c7ea8f6a06a53d)
1  /*
2   * Copyright (C) 2015 The Android Open Source Project
3   *
4   * Licensed under the Apache License, Version 2.0 (the "License");
5   * you may not use this file except in compliance with the License.
6   * You may obtain a copy of the License at
7   *
8   *      http://www.apache.org/licenses/LICENSE-2.0
9   *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  #include "Locale.h"
18  
19  #include <algorithm>
20  
21  #include <hb.h>
22  
23  #include "minikin/LocaleList.h"
24  
25  #include "LocaleListCache.h"
26  #include "MinikinInternal.h"
27  #include "StringPiece.h"
28  
29  namespace minikin {
30  
// Mask selecting the low five bits of a value. Each letter or digit of a packed language,
// region or script code is stored in one five-bit field.
constexpr uint32_t FIVE_BITS = (1u << 5) - 1u;
32  
registerLocaleList(const std::string & locales)33  uint32_t registerLocaleList(const std::string& locales) {
34      return LocaleListCache::getId(locales);
35  }
36  
getLocaleString(uint32_t localeId)37  std::string getLocaleString(uint32_t localeId) {
38      const LocaleList& localeList = LocaleListCache::getById(localeId);
39      std::string out;
40      for (size_t i = 0; i < localeList.size(); ++i) {
41          if (i != 0) {
42              out += ",";
43          }
44          out += localeList[i].getString();
45      }
46      return out;
47  }
48  
// Returns true if buf (of which bufLen bytes are valid) starts with the subtag `subtag` of
// length subtagLen, and that match is a complete subtag: it is followed by the end of the
// buffer, a NUL byte, '-' or '_'.
static bool isSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
    if (bufLen < subtagLen || strncmp(buf, subtag, subtagLen) != 0) {
        return false;  // Too short to hold the subtag, or the leading characters differ.
    }
    if (bufLen == subtagLen) {
        return true;  // The subtag is the last thing in the buffer.
    }
    // Something follows the match; accept it only if it terminates the subtag.
    switch (buf[subtagLen]) {
        case '\0':
        case '-':
        case '_':
            return true;
        default:
            return false;
    }
}
61  
62  // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0.
63  // For the region code, the letters must be all digits in three letter case, so the number of
64  // possible values are 10. For the language code, the letters must be all small alphabets, so the
65  // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the
66  // three letter language code or region code to 15 bits.
67  //
68  // In case of two letter code, use fullbit(0x1f) for the first letter instead.
packLanguageOrRegion(const StringPiece & in,uint8_t twoLetterBase,uint8_t threeLetterBase)69  static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
70                                       uint8_t threeLetterBase) {
71      if (in.length() == 2) {
72          return 0x7c00u |  // 0x1fu << 10
73                 (uint16_t)(in[0] - twoLetterBase) << 5 | (uint16_t)(in[1] - twoLetterBase);
74      } else {
75          return ((uint16_t)(in[0] - threeLetterBase) << 10) |
76                 (uint16_t)(in[1] - threeLetterBase) << 5 | (uint16_t)(in[2] - threeLetterBase);
77      }
78  }
79  
unpackLanguageOrRegion(uint16_t in,char * out,uint8_t twoLetterBase,uint8_t threeLetterBase)80  static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
81                                       uint8_t threeLetterBase) {
82      uint8_t first = (in >> 10) & FIVE_BITS;
83      uint8_t second = (in >> 5) & FIVE_BITS;
84      uint8_t third = in & FIVE_BITS;
85  
86      if (first == 0x1f) {
87          out[0] = second + twoLetterBase;
88          out[1] = third + twoLetterBase;
89          return 2;
90      } else {
91          out[0] = first + threeLetterBase;
92          out[1] = second + threeLetterBase;
93          out[2] = third + threeLetterBase;
94          return 3;
95      }
96  }
97  
packLanguage(const StringPiece & in)98  static uint16_t packLanguage(const StringPiece& in) {
99      return packLanguageOrRegion(in, 'a', 'a');
100  }
101  
unpackLanguage(uint16_t in,char * out)102  static size_t unpackLanguage(uint16_t in, char* out) {
103      return unpackLanguageOrRegion(in, out, 'a', 'a');
104  }
105  
// Packs a four-letter script code into 20 bits, five bits per letter. The first letter is
// stored as its offset from 'A' and the other three as offsets from 'a'.
constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
    return (static_cast<uint32_t>(c1 - 'A') << 15) | (static_cast<uint32_t>(c2 - 'a') << 10) |
           (static_cast<uint32_t>(c3 - 'a') << 5) | static_cast<uint32_t>(c4 - 'a');
}

// Packs a script given as a four-byte tag whose most significant byte is the first letter.
constexpr uint32_t packScript(uint32_t script) {
    return packScript(static_cast<char>(script >> 24), static_cast<char>((script >> 16) & 0xff),
                      static_cast<char>((script >> 8) & 0xff), static_cast<char>(script & 0xff));
}
116  
unpackScript(uint32_t packedScript)117  constexpr uint32_t unpackScript(uint32_t packedScript) {
118      constexpr char FIRST_LETTER_BASE = 'A';
119      constexpr char REST_LETTER_BASE = 'a';
120      const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE;
121      const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE;
122      const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE;
123      const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE;
124  
125      return first << 24 | second << 16 | third << 8 | fourth;
126  }
127  
packRegion(const StringPiece & in)128  static uint16_t packRegion(const StringPiece& in) {
129      return packLanguageOrRegion(in, 'A', '0');
130  }
131  
unpackRegion(uint16_t in,char * out)132  static size_t unpackRegion(uint16_t in, char* out) {
133      return unpackLanguageOrRegion(in, out, 'A', '0');
134  }
135  
// Returns true if c is in the range 'a'..'z'.
static inline bool isLowercase(char c) {
    return !(c < 'a' || c > 'z');
}
139  
// Returns true if c is in the range 'A'..'Z'.
static inline bool isUppercase(char c) {
    return !(c < 'A' || c > 'Z');
}
143  
// Returns true if c is in the range '0'..'9'.
static inline bool isDigit(char c) {
    return !(c < '0' || c > '9');
}
147  
148  // Returns true if the buffer is valid for language code.
isValidLanguageCode(const StringPiece & buffer)149  static inline bool isValidLanguageCode(const StringPiece& buffer) {
150      if (buffer.length() != 2 && buffer.length() != 3) return false;
151      if (!isLowercase(buffer[0])) return false;
152      if (!isLowercase(buffer[1])) return false;
153      if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
154      return true;
155  }
156  
157  // Returns true if buffer is valid for script code. The length of buffer must be 4.
isValidScriptCode(const StringPiece & buffer)158  static inline bool isValidScriptCode(const StringPiece& buffer) {
159      return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
160             isLowercase(buffer[2]) && isLowercase(buffer[3]);
161  }
162  
163  // Returns true if the buffer is valid for region code.
isValidRegionCode(const StringPiece & buffer)164  static inline bool isValidRegionCode(const StringPiece& buffer) {
165      return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
166             (buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
167  }
168  
169  // Parse BCP 47 language identifier into internal structure
Locale(const StringPiece & input)170  Locale::Locale(const StringPiece& input) : Locale() {
171      SplitIterator it(input, '-');
172  
173      StringPiece language = it.next();
174      if (isValidLanguageCode(language)) {
175          mLanguage = packLanguage(language);
176      } else {
177          // We don't understand anything other than two-letter or three-letter
178          // language codes, so we skip parsing the rest of the string.
179          return;
180      }
181  
182      if (!it.hasNext()) {
183          return;  // Language code only.
184      }
185      StringPiece token = it.next();
186  
187      if (isValidScriptCode(token)) {
188          mScript = packScript(token[0], token[1], token[2], token[3]);
189          mSubScriptBits = scriptToSubScriptBits(mScript);
190  
191          if (!it.hasNext()) {
192              goto finalize;  // No variant, emoji subtag and region code.
193          }
194          token = it.next();
195      }
196  
197      if (isValidRegionCode(token)) {
198          mRegion = packRegion(token);
199  
200          if (!it.hasNext()) {
201              goto finalize;  // No variant or emoji subtag.
202          }
203          token = it.next();
204      }
205  
206      if (language == "de") {  // We are only interested in German variants.
207          if (token == "1901") {
208              mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
209          } else if (token == "1996") {
210              mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
211          }
212  
213          if (mVariant != Variant::NO_VARIANT) {
214              if (!it.hasNext()) {
215                  goto finalize;  // No emoji subtag.
216              }
217  
218              token = it.next();
219          }
220      }
221  
222      resolveUnicodeExtension(input.data(), input.length());
223  
224  finalize:
225      if (mEmojiStyle == EmojiStyle::EMPTY) {
226          mEmojiStyle = scriptToEmojiStyle(mScript);
227      }
228  }
229  
resolveUnicodeExtension(const char * buf,size_t length)230  void Locale::resolveUnicodeExtension(const char* buf, size_t length) {
231      static const char kPrefix[] = "-u-";
232      const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
233      if (pos != buf + length) {
234          pos += strlen(kPrefix);
235          const size_t remainingLength = length - (pos - buf);
236          mEmojiStyle = resolveEmojiStyle(pos, remainingLength);
237      }
238  }
239  
240  // static
241  // Lookup emoji subtag and determine the emoji style.
resolveEmojiStyle(const char * buf,size_t length)242  EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
243      // 7 is the length of "-u-em-text", which is the shortest emoji subtag,
244      // unnecessary comparison can be avoided if total length is smaller than 10.
245      const size_t kMinSubtagLength = 7;
246      if (length >= kMinSubtagLength) {
247          static const char kPrefix[] = "em-";
248          const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
249          if (pos != buf + length) {  // found
250              pos += strlen(kPrefix);
251              const size_t remainingLength = length - (pos - buf);
252              if (isSubtag(pos, remainingLength, "emoji", 5)) {
253                  return EmojiStyle::EMOJI;
254              } else if (isSubtag(pos, remainingLength, "text", 4)) {
255                  return EmojiStyle::TEXT;
256              } else if (isSubtag(pos, remainingLength, "default", 7)) {
257                  return EmojiStyle::DEFAULT;
258              }
259          }
260      }
261      return EmojiStyle::EMPTY;
262  }
263  
scriptToEmojiStyle(uint32_t script)264  EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
265      // If no emoji subtag was provided, resolve the emoji style from script code.
266      if (script == packScript('Z', 's', 'y', 'e')) {
267          return EmojiStyle::EMOJI;
268      } else if (script == packScript('Z', 's', 'y', 'm')) {
269          return EmojiStyle::TEXT;
270      }
271      return EmojiStyle::EMPTY;
272  }
273  
274  // static
scriptToSubScriptBits(uint32_t script)275  uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
276      uint8_t subScriptBits = 0u;
277      switch (script) {
278          case packScript('B', 'o', 'p', 'o'):
279              subScriptBits = kBopomofoFlag;
280              break;
281          case packScript('H', 'a', 'n', 'g'):
282              subScriptBits = kHangulFlag;
283              break;
284          case packScript('H', 'a', 'n', 'b'):
285              // Bopomofo is almost exclusively used in Taiwan.
286              subScriptBits = kHanFlag | kBopomofoFlag;
287              break;
288          case packScript('H', 'a', 'n', 'i'):
289              subScriptBits = kHanFlag;
290              break;
291          case packScript('H', 'a', 'n', 's'):
292              subScriptBits = kHanFlag | kSimplifiedChineseFlag;
293              break;
294          case packScript('H', 'a', 'n', 't'):
295              subScriptBits = kHanFlag | kTraditionalChineseFlag;
296              break;
297          case packScript('H', 'i', 'r', 'a'):
298              subScriptBits = kHiraganaFlag;
299              break;
300          case packScript('H', 'r', 'k', 't'):
301              subScriptBits = kKatakanaFlag | kHiraganaFlag;
302              break;
303          case packScript('J', 'p', 'a', 'n'):
304              subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
305              break;
306          case packScript('K', 'a', 'n', 'a'):
307              subScriptBits = kKatakanaFlag;
308              break;
309          case packScript('K', 'o', 'r', 'e'):
310              subScriptBits = kHanFlag | kHangulFlag;
311              break;
312      }
313      return subScriptBits;
314  }
315  
getString() const316  std::string Locale::getString() const {
317      char buf[32];
318      int i = buildLocaleString(buf);
319      return std::string(buf, i);
320  }
321  
getStringWithLineBreakOption(LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle) const322  std::string Locale::getStringWithLineBreakOption(LineBreakStyle lbStyle,
323                                                   LineBreakWordStyle lbWordStyle) const {
324      char buf[48];
325      int i = buildLocaleString(buf);
326  
327      // Add line break unicode extension.
328      if (lbStyle != LineBreakStyle::None || lbWordStyle != LineBreakWordStyle::None) {
329          buf[i++] = '-';
330          buf[i++] = 'u';
331      }
332  
333      if (lbStyle != LineBreakStyle::None) {
334          buf[i++] = '-';
335          buf[i++] = 'l';
336          buf[i++] = 'b';
337          buf[i++] = '-';
338          switch (lbStyle) {
339              case LineBreakStyle::Loose:
340                  buf[i++] = 'l';
341                  buf[i++] = 'o';
342                  buf[i++] = 'o';
343                  buf[i++] = 's';
344                  buf[i++] = 'e';
345                  break;
346              case LineBreakStyle::Normal:
347                  buf[i++] = 'n';
348                  buf[i++] = 'o';
349                  buf[i++] = 'r';
350                  buf[i++] = 'm';
351                  buf[i++] = 'a';
352                  buf[i++] = 'l';
353                  break;
354              case LineBreakStyle::Strict:
355                  buf[i++] = 's';
356                  buf[i++] = 't';
357                  buf[i++] = 'r';
358                  buf[i++] = 'i';
359                  buf[i++] = 'c';
360                  buf[i++] = 't';
361                  break;
362              default:
363                  MINIKIN_ASSERT(false, "Must not reached.");
364          }
365      }
366  
367      if (lbWordStyle != LineBreakWordStyle::None) {
368          buf[i++] = '-';
369          buf[i++] = 'l';
370          buf[i++] = 'w';
371          buf[i++] = '-';
372          switch (lbWordStyle) {
373              case LineBreakWordStyle::Phrase:
374                  buf[i++] = 'p';
375                  buf[i++] = 'h';
376                  buf[i++] = 'r';
377                  buf[i++] = 'a';
378                  buf[i++] = 's';
379                  buf[i++] = 'e';
380                  break;
381              default:
382                  MINIKIN_ASSERT(false, "Must not reached.");
383          }
384      }
385      return std::string(buf, i);
386  }
387  
buildLocaleString(char * buf) const388  int Locale::buildLocaleString(char* buf) const {
389      size_t i;
390      if (mLanguage == NO_LANGUAGE) {
391          buf[0] = 'u';
392          buf[1] = 'n';
393          buf[2] = 'd';
394          i = 3;
395      } else {
396          i = unpackLanguage(mLanguage, buf);
397      }
398      if (mScript != NO_SCRIPT) {
399          uint32_t rawScript = unpackScript(mScript);
400          buf[i++] = '-';
401          buf[i++] = (rawScript >> 24) & 0xFFu;
402          buf[i++] = (rawScript >> 16) & 0xFFu;
403          buf[i++] = (rawScript >> 8) & 0xFFu;
404          buf[i++] = rawScript & 0xFFu;
405      }
406      if (mRegion != NO_REGION) {
407          buf[i++] = '-';
408          i += unpackRegion(mRegion, buf + i);
409      }
410      if (mVariant != Variant::NO_VARIANT) {
411          buf[i++] = '-';
412          buf[i++] = '1';
413          buf[i++] = '9';
414          switch (mVariant) {
415              case Variant::GERMAN_1901_ORTHOGRAPHY:
416                  buf[i++] = '0';
417                  buf[i++] = '1';
418                  break;
419              case Variant::GERMAN_1996_ORTHOGRAPHY:
420                  buf[i++] = '9';
421                  buf[i++] = '6';
422                  break;
423              default:
424                  MINIKIN_ASSERT(false, "Must not reached.");
425          }
426      }
427      return i;
428  }
429  
getPartialLocale(SubtagBits bits) const430  Locale Locale::getPartialLocale(SubtagBits bits) const {
431      Locale subLocale;
432      if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) {
433          subLocale.mLanguage = mLanguage;
434      } else {
435          subLocale.mLanguage = packLanguage("und");
436      }
437      if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) {
438          subLocale.mScript = mScript;
439          subLocale.mSubScriptBits = mSubScriptBits;
440      }
441      if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) {
442          subLocale.mRegion = mRegion;
443      }
444      if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) {
445          subLocale.mVariant = mVariant;
446      }
447      if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) {
448          subLocale.mEmojiStyle = mEmojiStyle;
449      }
450      return subLocale;
451  }
452  
isEqualScript(const Locale & other) const453  bool Locale::isEqualScript(const Locale& other) const {
454      return other.mScript == mScript;
455  }
456  
457  // static
supportsScript(uint8_t providedBits,uint8_t requestedBits)458  bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
459      return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
460  }
461  
supportsScript(uint32_t script) const462  bool Locale::supportsScript(uint32_t script) const {
463      static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
464                    "The Minikin script and HarfBuzz hb_script_t have different encodings.");
465      uint32_t packedScript = packScript(script);
466      if (packedScript == mScript) return true;
467      return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
468  }
469  
supportsScript(char c1,char c2,char c3,char c4) const470  bool Locale::supportsScript(char c1, char c2, char c3, char c4) const {
471      uint32_t packedScript = packScript(c1, c2, c3, c4);
472      if (packedScript == mScript) return true;
473      return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
474  }
475  
calcScoreFor(const LocaleList & supported) const476  int Locale::calcScoreFor(const LocaleList& supported) const {
477      bool languageScriptMatch = false;
478      bool subtagMatch = false;
479      bool scriptMatch = false;
480  
481      for (size_t i = 0; i < supported.size(); ++i) {
482          if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
483              subtagMatch = true;
484              if (mLanguage == supported[i].mLanguage) {
485                  return 4;
486              }
487          }
488          if (isEqualScript(supported[i]) ||
489              supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
490              scriptMatch = true;
491              if (mLanguage == supported[i].mLanguage) {
492                  languageScriptMatch = true;
493              }
494          }
495      }
496  
497      if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
498          scriptMatch = true;
499          if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
500              return 3;
501          }
502      }
503  
504      if (languageScriptMatch) {
505          return 3;
506      } else if (subtagMatch) {
507          return 2;
508      } else if (scriptMatch) {
509          return 1;
510      }
511      return 0;
512  }
513  
buildHbLanguage(const Locale & locale)514  static hb_language_t buildHbLanguage(const Locale& locale) {
515      return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1)
516                                  : HB_LANGUAGE_INVALID;
517  }
518  
LocaleList(std::vector<Locale> && locales)519  LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
520      mIsAllTheSameLocale = true;
521      mUnionOfSubScriptBits = 0u;
522      mHbLangs.reserve(mLocales.size());
523      mEmojiStyle = EmojiStyle::EMPTY;
524      const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
525      for (const Locale& locale : mLocales) {
526          mUnionOfSubScriptBits |= locale.mSubScriptBits;
527          if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) {
528              mIsAllTheSameLocale = false;
529          }
530          mHbLangs.push_back(buildHbLanguage(locale));
531          if (mEmojiStyle == EmojiStyle::EMPTY) {
532              mEmojiStyle = locale.getEmojiStyle();
533          }
534      }
535  }
536  
atLeastOneScriptMatch(const LocaleList & list) const537  bool LocaleList::atLeastOneScriptMatch(const LocaleList& list) const {
538      if ((mUnionOfSubScriptBits & list.mUnionOfSubScriptBits) != 0) {
539          return true;
540      }
541  
542      for (const Locale& myLocale : mLocales) {
543          for (const Locale& otherLocale : list.mLocales) {
544              if (myLocale.isEqualScript(otherLocale)) {
545                  return true;
546              }
547          }
548      }
549  
550      return false;
551  }
552  
hasScript(char c1,char c2,char c3,char c4) const553  bool LocaleList::hasScript(char c1, char c2, char c3, char c4) const {
554      for (const Locale& locale : mLocales) {
555          if (locale.supportsScript(c1, c2, c3, c4)) {
556              return true;
557          }
558      }
559      return false;
560  }
561  
562  }  // namespace minikin
563