xref: /aosp_15_r20/bionic/tests/iconv_test.cpp (revision 8d67ca893c1523eb926b9080dbe4e2ffd2a27ba1)
1  /*
2   * Copyright (C) 2017 The Android Open Source Project
3   *
4   * Licensed under the Apache License, Version 2.0 (the "License");
5   * you may not use this file except in compliance with the License.
6   * You may obtain a copy of the License at
7   *
8   *      http://www.apache.org/licenses/LICENSE-2.0
9   *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
#include <gtest/gtest.h>

#include <iconv.h>
#include <string.h>

#include "utils.h"
22  
// The value iconv_open() hands back on failure, i.e. (iconv_t)-1.
#define INVALID_ICONV_T (reinterpret_cast<iconv_t>(-1))
24  
// iconv_open() must fail with EINVAL whenever either encoding name is unknown.
TEST(iconv, iconv_open_EINVAL) {
  // {destination, source} pairs in which at least one name is bogus.
  static const char* const kBadPairs[][2] = {
    {"silly", "silly"},
    {"silly", "UTF-8"},
    {"UTF-8", "silly"},
  };
  for (const auto& pair : kBadPairs) {
    errno = 0;
    ASSERT_EQ(INVALID_ICONV_T, iconv_open(pair[0], pair[1])) << pair[0] << " <- " << pair[1];
    ASSERT_ERRNO(EINVAL);
  }
}
36  
// Checks that encoding names are compared using the loose alias-matching rules
// from Unicode TR22 rather than by exact string comparison.
TEST(iconv, iconv_open_comparator) {
  // Examples from http://www.unicode.org/reports/tr22/#Charset_Alias_Matching:
  // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..."
  static const char* const kMatching[] = {"utf8", "u.t.f-008"};
  for (const char* alias : kMatching) {
    iconv_t c = iconv_open("UTF-8", alias);
    ASSERT_NE(INVALID_ICONV_T, c) << alias;
    ASSERT_EQ(0, iconv_close(c)) << alias;
  }

  // "...but not "utf-80" or "ut8"."
  // "ut80" is not one of the TR22 examples, but this test has always checked
  // it, so it is kept alongside the two names the quote actually mentions.
  static const char* const kNonMatching[] = {"utf-80", "ut8", "ut80"};
  for (const char* alias : kNonMatching) {
    errno = 0;
    ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", alias)) << alias;
    ASSERT_ERRNO(EINVAL);
  }
}
54  
// Converts a short UTF-8 string made of one-, two- and three-byte sequences to
// UTF-32LE and checks every resulting code point plus the byte accounting.
TEST(iconv, iconv_smoke) {
  const char* utf8 = "a٦ᄀ"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-32LE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* in = const_cast<char*>(utf8);
  size_t in_bytes = strlen(in);

  char* out = buf;
  size_t out_bytes = sizeof(buf);

  EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes));

  // Copy the converted bytes into real wchar_t storage instead of reading the
  // char buffer through a wchar_t*: that avoids any alignment or aliasing
  // problems. Comparing against wide literals still relies on wchar_t being a
  // 32-bit little-endian type, exactly as the output encoding is.
  static_assert(sizeof(wchar_t) == 4, "this test compares UTF-32 output against wchar_t literals");
  wchar_t utf32[4];
  memcpy(utf32, buf, sizeof(utf32));
  EXPECT_EQ(L'a', utf32[0]);
  EXPECT_EQ(L'٦', utf32[1]);
  EXPECT_EQ(L'ᄀ', utf32[2]);
  EXPECT_EQ(L'\0', utf32[3]); // Nothing was written past the third character.
  EXPECT_EQ(0U, in_bytes);
  EXPECT_EQ(sizeof(buf) - (3 /* chars */ * 4 /* bytes each */), out_bytes);

  ASSERT_EQ(0, iconv_close(c));
}
80  
// With "//TRANSLIT", characters that cannot be represented in the destination
// encoding are replaced, and iconv() returns how many replacements it made.
TEST(iconv, iconv_lossy_TRANSLIT) {
  const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("ASCII//TRANSLIT", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // Two of the input characters (5 input bytes) aren't representable as ASCII.
  // Each one becomes a replacement character, and the count is reported back.
  const size_t replacements = iconv(c, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(2U, replacements);

  // Four output bytes, followed by the buffer's untouched zero fill.
  EXPECT_EQ('a', buf[0]);
  EXPECT_EQ('?', buf[1]);
  EXPECT_EQ('?', buf[2]);
  EXPECT_EQ('z', buf[3]);
  EXPECT_EQ(0, buf[4]);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 4, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
109  
// With "//IGNORE", unrepresentable characters are dropped from the output, but
// the call as a whole still reports EILSEQ.
TEST(iconv, iconv_lossy_IGNORE) {
  const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("ASCII//IGNORE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // Two of the input characters (5 input bytes) aren't representable as ASCII.
  // They are skipped, yet the conversion is still reported as a failure.
  errno = 0;
  const size_t result = iconv(c, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(static_cast<size_t>(-1), result);
  EXPECT_ERRNO(EILSEQ);

  // Only the two ASCII characters reach the output; all input was consumed.
  EXPECT_EQ('a', buf[0]);
  EXPECT_EQ('z', buf[1]);
  EXPECT_EQ(0, buf[2]);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
137  
// Without any "//" suffix, conversion stops with EILSEQ at the first character
// that the destination encoding cannot represent.
TEST(iconv, iconv_lossy) {
  const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("ASCII", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // The second input character isn't representable as ASCII, so we stop there.
  errno = 0;
  const size_t result = iconv(c, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(static_cast<size_t>(-1), result);
  EXPECT_ERRNO(EILSEQ);

  // Only the leading 'a' was converted.
  EXPECT_EQ('a', buf[0]);
  EXPECT_EQ(0, buf[1]);
  EXPECT_EQ(6U, src_left); // Two bytes for ٦, three bytes for ᄀ, and one byte for z.
  EXPECT_EQ(sizeof(buf) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
163  
// A lead byte followed by something that is not a continuation byte is
// reported as EILSEQ; the caller can step over it and resume converting.
TEST(iconv, iconv_malformed_sequence_EILSEQ) {
  const char* utf8 = "a\xd9z"; // 0xd9 is the first byte of the two-byte U+0666 ٦.
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // The second input byte is a malformed character, so we stop there.
  errno = 0;
  const size_t first_result = iconv(c, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(static_cast<size_t>(-1), first_result);
  EXPECT_ERRNO(EILSEQ);
  EXPECT_EQ('\xd9', *src); // The input pointer is left at the start of the invalid sequence.

  // Skip the offending byte by hand and convert whatever follows it.
  ++src;
  --src_left;
  errno = 0;
  const size_t second_result = iconv(c, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(0U, second_result);
  EXPECT_ERRNO(0);

  // The two valid characters end up adjacent in the output.
  EXPECT_EQ('a', buf[0]);
  EXPECT_EQ('z', buf[1]);
  EXPECT_EQ(0, buf[2]);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
196  
// Input that ends in the middle of a multi-byte character is reported as
// EINVAL (incomplete) rather than EILSEQ (malformed).
TEST(iconv, iconv_incomplete_sequence_EINVAL) {
  const char* utf8 = "a\xd9"; // 0xd9 is the first byte of the two-byte U+0666 ٦.
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // The second input byte is just the start of a character, and we don't have any more bytes.
  errno = 0;
  const size_t result = iconv(c, &src, &src_left, &dst, &dst_left);
  EXPECT_EQ(static_cast<size_t>(-1), result);
  EXPECT_ERRNO(EINVAL);
  EXPECT_EQ('\xd9', *src); // The input pointer is left at the start of the incomplete sequence.

  // The complete character before it was still converted.
  EXPECT_EQ('a', buf[0]);
  EXPECT_EQ(0, buf[1]);
  EXPECT_EQ(1U, src_left);
  EXPECT_EQ(sizeof(buf) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
223  
// Feeds "abc" through a converter while offering too little output space, and
// checks that each call reports E2BIG yet keeps whatever progress it made.
TEST(iconv, iconv_E2BIG) {
  const char* utf8 = "abc";
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* in = const_cast<char*>(utf8);
  size_t in_bytes = strlen(in);
  char* out = buf;

  struct Step {
    size_t out_room;          // Output space offered to this call.
    size_t expected_result;   // Expected return value of iconv().
    int expected_errno;       // Expected errno after the call.
    size_t expected_in_left;  // Expected unconsumed input after the call.
  };
  const size_t kFailure = static_cast<size_t>(-1);
  const Step steps[] = {
    // We need three bytes, so one isn't enough (but we will make progress).
    {1, kFailure, E2BIG, 2},
    // Two bytes left, so zero isn't enough (and we can't even make progress).
    {0, kFailure, E2BIG, 2},
    // Two bytes left, so one isn't enough (but we will make progress).
    {1, kFailure, E2BIG, 1},
    // One byte left, so one byte is now enough.
    {1, 0, 0, 0},
  };
  for (const Step& step : steps) {
    size_t out_bytes = step.out_room;
    errno = 0;
    EXPECT_EQ(step.expected_result, iconv(c, &in, &in_bytes, &out, &out_bytes));
    EXPECT_ERRNO(step.expected_errno);
    EXPECT_EQ(step.expected_in_left, in_bytes);
    EXPECT_EQ(0U, out_bytes); // Any space that was offered is always used up.
  }

  // Across all the calls, the whole input arrived in order.
  EXPECT_EQ('a', buf[0]);
  EXPECT_EQ('b', buf[1]);
  EXPECT_EQ('c', buf[2]);
  EXPECT_EQ(0, buf[3]);

  ASSERT_EQ(0, iconv_close(c));
}
276  
// iconv() on the invalid-descriptor sentinel must fail with EBADF.
TEST(iconv, iconv_invalid_converter_EBADF) {
  char* src = nullptr;
  size_t src_left = 0;
  char* dst = nullptr;
  size_t dst_left = 0;

  errno = 0;
  const size_t result = iconv(INVALID_ICONV_T, &src, &src_left, &dst, &dst_left);
  ASSERT_EQ(static_cast<size_t>(-1), result);
  ASSERT_ERRNO(EBADF);
}
286  
// iconv_close() on the invalid-descriptor sentinel must fail with EBADF.
TEST(iconv, iconv_close_invalid_converter_EBADF) {
  errno = 0;
  const int result = iconv_close(INVALID_ICONV_T);
  ASSERT_EQ(-1, result);
  ASSERT_ERRNO(EBADF);
}
292  
RoundTrip(const char * dst_enc,const char * expected_bytes,size_t n)293  static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) {
294    // Examples from https://en.wikipedia.org/wiki/UTF-16.
295    const char* utf8 = "$€��"; // U+0024, U+20AC, U+10437.
296  
297    iconv_t c = iconv_open(dst_enc, "UTF-8");
298    ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
299  
300    char* in = const_cast<char*>(utf8);
301    size_t in_bytes = strlen(utf8);
302    char buf[BUFSIZ] = {};
303    char* out = buf;
304    size_t out_bytes = sizeof(buf);
305    size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes);
306  
307    // Check we got the bytes we were expecting.
308    for (size_t i = 0; i < n; ++i) {
309      EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc;
310    }
311  
312    ASSERT_EQ(0, iconv_close(c));
313  
314    // We can't round-trip if there were replacements.
315    if (strstr(dst_enc, "ascii")) {
316      GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n";
317      return;
318    }
319    ASSERT_EQ(0U, replacement_count);
320  
321    c = iconv_open("UTF-8", dst_enc);
322    ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
323  
324    in = buf;
325    in_bytes = n;
326    char buf2[BUFSIZ] = {};
327    out = buf2;
328    out_bytes = sizeof(buf2);
329    iconv(c, &in, &in_bytes, &out, &out_bytes);
330  
331    ASSERT_STREQ(utf8, buf2) << dst_enc;
332  
333    ASSERT_EQ(0, iconv_close(c));
334  }
335  
// Only '$' survives a conversion to ASCII; the other two characters are each
// expected to come out as '?'.
TEST(iconv, iconv_round_trip_ascii) {
  static const char kExpected[] = "$??";
  RoundTrip("ascii//TRANSLIT", kExpected, sizeof(kExpected) - 1);
}
339  
// UTF-8: one-, three- and four-byte sequences for the three sample characters.
TEST(iconv, iconv_round_trip_utf8) {
  static const char kExpected[] = "\x24\xe2\x82\xac\xf0\x90\x90\xb7";
  RoundTrip("utf8", kExpected, sizeof(kExpected) - 1);
}
343  
// UTF-16BE: two single code units, then a surrogate pair, high byte first.
TEST(iconv, iconv_round_trip_utf16be) {
  static const char kExpected[] = "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37";
  RoundTrip("utf16be", kExpected, sizeof(kExpected) - 1);
}
347  
// UTF-16LE: two single code units, then a surrogate pair, low byte first.
TEST(iconv, iconv_round_trip_utf16le) {
  static const char kExpected[] = "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc";
  RoundTrip("utf16le", kExpected, sizeof(kExpected) - 1);
}
351  
// UTF-32BE: one four-byte unit per character, most significant byte first.
TEST(iconv, iconv_round_trip_utf32be) {
  static const char kExpected[] = "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37";
  RoundTrip("utf32be", kExpected, sizeof(kExpected) - 1);
}
355  
// UTF-32LE: one four-byte unit per character, least significant byte first.
TEST(iconv, iconv_round_trip_utf32le) {
  static const char kExpected[] = "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("utf32le", kExpected, sizeof(kExpected) - 1);
}
359  
// "wchar_t" is expected to produce the same bytes as the UTF-32LE case.
TEST(iconv, iconv_round_trip_wchar_t) {
  static const char kExpected[] = "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("wchar_t", kExpected, sizeof(kExpected) - 1);
}
363  
Check(int expected_errno,const char * src_enc,const char * src,size_t n)364  static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) {
365    iconv_t c = iconv_open("wchar_t", src_enc);
366    char* in = const_cast<char*>(src);
367    size_t in_bytes = n;
368    wchar_t out_buf[16];
369    size_t out_bytes = sizeof(out_buf);
370    char* out = reinterpret_cast<char*>(out_buf);
371    errno = 0;
372    ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
373    EXPECT_ERRNO(expected_errno);
374    EXPECT_EQ(0, iconv_close(c));
375  }
376  
// A byte above 0x7f is not valid ASCII.
TEST(iconv, iconv_EILSEQ_ascii) {
  static const char kInput[] = "\xac";
  Check(EILSEQ, "ASCII", kInput, sizeof(kInput) - 1);
}
380  
// 0x82 is not a valid first byte of a UTF-8 sequence.
TEST(iconv, iconv_EILSEQ_utf8_initial) {
  static const char kInput[] = "\x82";
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
384  
// The second byte of this three-byte input is invalid in that position.
TEST(iconv, iconv_EILSEQ_utf8_non_initial) {
  static const char kInput[] = "\xe2\xe2\x82";
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
388  
// A surrogate pair given in the wrong order (low surrogate first), big-endian.
TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) {
  static const char kInput[] = "\xdc\x37" "\xd8\x01";
  Check(EILSEQ, "utf16be", kInput, sizeof(kInput) - 1);
}
392  
// A surrogate pair given in the wrong order (low surrogate first), little-endian.
TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) {
  static const char kInput[] = "\x37\xdc" "\x01\xd8";
  Check(EILSEQ, "utf16le", kInput, sizeof(kInput) - 1);
}
396  
// Only the first two bytes of a three-byte UTF-8 sequence are supplied.
TEST(iconv, iconv_EINVAL_utf8_short) {
  static const char kInput[] = "\xe2\x82";
  Check(EINVAL, "utf8", kInput, sizeof(kInput) - 1);
}
400  
// Only the first byte of a two-byte UTF-16BE code unit is supplied.
TEST(iconv, iconv_EINVAL_utf16be_short) {
  static const char kInput[] = "\x00";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
404  
// A big-endian high surrogate with nothing after it.
TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) {
  static const char kInput[] = "\xd8\x01";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
408  
// A big-endian high surrogate followed by only one byte of the low surrogate.
TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) {
  static const char kInput[] = "\xd8\x01\xdc";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
412  
// Only the first byte of a two-byte UTF-16LE code unit is supplied.
TEST(iconv, iconv_EINVAL_utf16le_short) {
  static const char kInput[] = "\x24";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
416  
// A little-endian high surrogate with nothing after it.
TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) {
  static const char kInput[] = "\x01\xd8";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
420  
// A little-endian high surrogate followed by only one byte of the low surrogate.
TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) {
  static const char kInput[] = "\x01\xd8\x37";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
424  
// Only three of the four bytes of a UTF-32BE unit are supplied.
TEST(iconv, iconv_EINVAL_utf32be_short) {
  static const char kInput[] = "\x00\x00\x00";
  Check(EINVAL, "utf32be", kInput, sizeof(kInput) - 1);
}
428  
// Only three of the four bytes of a UTF-32LE unit are supplied.
TEST(iconv, iconv_EINVAL_utf32le_short) {
  static const char kInput[] = "\x24\x00\x00";
  Check(EINVAL, "utf32le", kInput, sizeof(kInput) - 1);
}
432  
// Exercises the three ways of asking iconv() to reset a converter to its
// initial shift state. Each call must succeed, leave errno untouched and write
// nothing to the output buffer.
TEST(iconv, iconv_initial_shift_state) {
  // POSIX: "For state-dependent encodings, the conversion descriptor
  // cd is placed into its initial shift state by a call for which inbuf
  // is a null pointer, or for which inbuf points to a null pointer."
  iconv_t c = iconv_open("utf8", "utf8");
  // Stop here if the open failed; otherwise every call below would fail with
  // EBADF and hide the real cause.
  ASSERT_NE(INVALID_ICONV_T, c);

  char* in = nullptr;
  size_t in_bytes = 0;
  wchar_t out_buf[16];
  size_t out_bytes = sizeof(out_buf);
  char* out = reinterpret_cast<char*>(out_buf);

  // Points to a null pointer...
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, &in, &in_bytes, &out, &out_bytes));
  EXPECT_ERRNO(0);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  // Is a null pointer...
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, &in_bytes, &out, &out_bytes));
  EXPECT_ERRNO(0);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  // Is a null pointer and so is in_bytes. This isn't specified by POSIX, but
  // glibc and macOS both allow that, where Android historically didn't.
  // https://issuetracker.google.com/180598400
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, nullptr, &out, &out_bytes));
  EXPECT_ERRNO(0);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  EXPECT_EQ(0, iconv_close(c));
}
466