1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <gtest/gtest.h> 18 19 #include <iconv.h> 20 21 #include "utils.h" 22 23 #define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1) 24 TEST(iconv,iconv_open_EINVAL)25 TEST(iconv, iconv_open_EINVAL) { 26 errno = 0; 27 ASSERT_EQ(INVALID_ICONV_T, iconv_open("silly", "silly")); 28 ASSERT_ERRNO(EINVAL); 29 errno = 0; 30 ASSERT_EQ(INVALID_ICONV_T, iconv_open("silly", "UTF-8")); 31 ASSERT_ERRNO(EINVAL); 32 errno = 0; 33 ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "silly")); 34 ASSERT_ERRNO(EINVAL); 35 } 36 TEST(iconv,iconv_open_comparator)37 TEST(iconv, iconv_open_comparator) { 38 // Examples from http://www.unicode.org/reports/tr22/#Charset_Alias_Matching: 39 // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..." 40 iconv_t c; 41 ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "utf8")); 42 ASSERT_EQ(0, iconv_close(c)); 43 ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "u.t.f-008")); 44 ASSERT_EQ(0, iconv_close(c)); 45 46 // "...but not "utf-80" or "ut8"." 47 errno = 0; 48 ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "utf-80")); 49 ASSERT_ERRNO(EINVAL); 50 errno = 0; 51 ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "ut80")); 52 ASSERT_ERRNO(EINVAL); 53 } 54 TEST(iconv,iconv_smoke)55 TEST(iconv, iconv_smoke) { 56 const char* utf8 = "a٦ᄀ"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80 57 char buf[BUFSIZ] = {}; 58 59 iconv_t c = iconv_open("UTF-32LE", "UTF-8"); 60 ASSERT_NE(INVALID_ICONV_T, c); 61 62 char* in = const_cast<char*>(utf8); 63 size_t in_bytes = strlen(in); 64 65 char* out = buf; 66 size_t out_bytes = sizeof(buf); 67 68 EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); 69 70 wchar_t* utf16 = reinterpret_cast<wchar_t*>(buf); 71 EXPECT_EQ(L'a', utf16[0]); 72 EXPECT_EQ(L'٦', utf16[1]); 73 EXPECT_EQ(L'ᄀ', utf16[2]); 74 EXPECT_EQ(L'\0', utf16[3]); 75 EXPECT_EQ(0U, in_bytes); 76 EXPECT_EQ(sizeof(buf) - (3 /* chars */ * 4 /* bytes each */), out_bytes); 77 78 ASSERT_EQ(0, iconv_close(c)); 79 } 80 TEST(iconv,iconv_lossy_TRANSLIT)81 TEST(iconv, iconv_lossy_TRANSLIT) { 82 const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80 83 char buf[BUFSIZ] = {}; 84 85 iconv_t c = iconv_open("ASCII//TRANSLIT", "UTF-8"); 86 ASSERT_NE(INVALID_ICONV_T, c); 87 88 char* in = const_cast<char*>(utf8); 89 size_t in_bytes = strlen(in); 90 91 char* out = buf; 92 size_t out_bytes = sizeof(buf); 93 94 // Two of the input characters (5 input bytes) aren't representable as ASCII. 95 // With "//TRANSLIT", we use a replacement character, and report the number 96 // of replacements. 97 EXPECT_EQ(2U, iconv(c, &in, &in_bytes, &out, &out_bytes)); 98 99 EXPECT_EQ('a', buf[0]); 100 EXPECT_EQ('?', buf[1]); 101 EXPECT_EQ('?', buf[2]); 102 EXPECT_EQ('z', buf[3]); 103 EXPECT_EQ(0, buf[4]); 104 EXPECT_EQ(0U, in_bytes); 105 EXPECT_EQ(sizeof(buf) - 4, out_bytes); 106 107 ASSERT_EQ(0, iconv_close(c)); 108 } 109 TEST(iconv,iconv_lossy_IGNORE)110 TEST(iconv, iconv_lossy_IGNORE) { 111 const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80 112 char buf[BUFSIZ] = {}; 113 114 iconv_t c = iconv_open("ASCII//IGNORE", "UTF-8"); 115 ASSERT_NE(INVALID_ICONV_T, c); 116 117 char* in = const_cast<char*>(utf8); 118 size_t in_bytes = strlen(in); 119 120 char* out = buf; 121 size_t out_bytes = sizeof(buf); 122 123 // Two of the input characters (5 input bytes) aren't representable as ASCII. 124 // With "//IGNORE", we just skip them (but return failure). 125 errno = 0; 126 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 127 EXPECT_ERRNO(EILSEQ); 128 129 EXPECT_EQ('a', buf[0]); 130 EXPECT_EQ('z', buf[1]); 131 EXPECT_EQ(0, buf[2]); 132 EXPECT_EQ(0U, in_bytes); 133 EXPECT_EQ(sizeof(buf) - 2, out_bytes); 134 135 ASSERT_EQ(0, iconv_close(c)); 136 } 137 TEST(iconv,iconv_lossy)138 TEST(iconv, iconv_lossy) { 139 const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80 140 char buf[BUFSIZ] = {}; 141 142 iconv_t c = iconv_open("ASCII", "UTF-8"); 143 ASSERT_NE(INVALID_ICONV_T, c); 144 145 char* in = const_cast<char*>(utf8); 146 size_t in_bytes = strlen(in); 147 148 char* out = buf; 149 size_t out_bytes = sizeof(buf); 150 151 // The second input character isn't representable as ASCII, so we stop there. 152 errno = 0; 153 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 154 EXPECT_ERRNO(EILSEQ); 155 156 EXPECT_EQ('a', buf[0]); 157 EXPECT_EQ(0, buf[1]); 158 EXPECT_EQ(6U, in_bytes); // Two bytes for ٦, three bytes for ᄀ, and one byte for z. 159 EXPECT_EQ(sizeof(buf) - 1, out_bytes); 160 161 ASSERT_EQ(0, iconv_close(c)); 162 } 163 TEST(iconv,iconv_malformed_sequence_EILSEQ)164 TEST(iconv, iconv_malformed_sequence_EILSEQ) { 165 const char* utf8 = "a\xd9z"; // 0xd9 is the first byte of the two-byte U+0666 ٦. 166 char buf[BUFSIZ] = {}; 167 168 iconv_t c = iconv_open("UTF-8", "UTF-8"); 169 ASSERT_NE(INVALID_ICONV_T, c); 170 171 char* in = const_cast<char*>(utf8); 172 size_t in_bytes = strlen(in); 173 174 char* out = buf; 175 size_t out_bytes = sizeof(buf); 176 177 // The second input byte is a malformed character, so we stop there. 178 errno = 0; 179 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 180 EXPECT_ERRNO(EILSEQ); 181 EXPECT_EQ('\xd9', *in); // *in is left pointing to the start of the invalid sequence. 182 ++in; 183 --in_bytes; 184 errno = 0; 185 EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); 186 EXPECT_ERRNO(0); 187 188 EXPECT_EQ('a', buf[0]); 189 EXPECT_EQ('z', buf[1]); 190 EXPECT_EQ(0, buf[2]); 191 EXPECT_EQ(0U, in_bytes); 192 EXPECT_EQ(sizeof(buf) - 2, out_bytes); 193 194 ASSERT_EQ(0, iconv_close(c)); 195 } 196 TEST(iconv,iconv_incomplete_sequence_EINVAL)197 TEST(iconv, iconv_incomplete_sequence_EINVAL) { 198 const char* utf8 = "a\xd9"; // 0xd9 is the first byte of the two-byte U+0666 ٦. 199 char buf[BUFSIZ] = {}; 200 201 iconv_t c = iconv_open("UTF-8", "UTF-8"); 202 ASSERT_NE(INVALID_ICONV_T, c); 203 204 char* in = const_cast<char*>(utf8); 205 size_t in_bytes = strlen(in); 206 207 char* out = buf; 208 size_t out_bytes = sizeof(buf); 209 210 // The second input byte is just the start of a character, and we don't have any more bytes. 211 errno = 0; 212 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 213 EXPECT_ERRNO(EINVAL); 214 EXPECT_EQ('\xd9', *in); // *in is left pointing to the start of the incomplete sequence. 215 216 EXPECT_EQ('a', buf[0]); 217 EXPECT_EQ(0, buf[1]); 218 EXPECT_EQ(1U, in_bytes); 219 EXPECT_EQ(sizeof(buf) - 1, out_bytes); 220 221 ASSERT_EQ(0, iconv_close(c)); 222 } 223 TEST(iconv,iconv_E2BIG)224 TEST(iconv, iconv_E2BIG) { 225 const char* utf8 = "abc"; 226 char buf[BUFSIZ] = {}; 227 228 iconv_t c = iconv_open("UTF-8", "UTF-8"); 229 ASSERT_NE(INVALID_ICONV_T, c); 230 231 char* in = const_cast<char*>(utf8); 232 size_t in_bytes = strlen(in); 233 234 char* out = buf; 235 size_t out_bytes = 1; 236 237 // We need three bytes, so one isn't enough (but we will make progress). 238 out_bytes = 1; 239 errno = 0; 240 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 241 EXPECT_ERRNO(E2BIG); 242 EXPECT_EQ(2U, in_bytes); 243 EXPECT_EQ(0U, out_bytes); 244 245 // Two bytes left, so zero isn't enough (and we can't even make progress). 246 out_bytes = 0; 247 errno = 0; 248 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 249 EXPECT_ERRNO(E2BIG); 250 EXPECT_EQ(2U, in_bytes); 251 EXPECT_EQ(0U, out_bytes); 252 253 // Two bytes left, so one isn't enough (but we will make progress). 254 out_bytes = 1; 255 errno = 0; 256 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 257 EXPECT_ERRNO(E2BIG); 258 EXPECT_EQ(1U, in_bytes); 259 EXPECT_EQ(0U, out_bytes); 260 261 // One byte left, so one byte is now enough. 262 out_bytes = 1; 263 errno = 0; 264 EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); 265 EXPECT_ERRNO(0); 266 EXPECT_EQ(0U, in_bytes); 267 EXPECT_EQ(0U, out_bytes); 268 269 EXPECT_EQ('a', buf[0]); 270 EXPECT_EQ('b', buf[1]); 271 EXPECT_EQ('c', buf[2]); 272 EXPECT_EQ(0, buf[3]); 273 274 ASSERT_EQ(0, iconv_close(c)); 275 } 276 TEST(iconv,iconv_invalid_converter_EBADF)277 TEST(iconv, iconv_invalid_converter_EBADF) { 278 char* in = nullptr; 279 char* out = nullptr; 280 size_t in_bytes = 0; 281 size_t out_bytes = 0; 282 errno = 0; 283 ASSERT_EQ(static_cast<size_t>(-1), iconv(INVALID_ICONV_T, &in, &in_bytes, &out, &out_bytes)); 284 ASSERT_ERRNO(EBADF); 285 } 286 TEST(iconv,iconv_close_invalid_converter_EBADF)287 TEST(iconv, iconv_close_invalid_converter_EBADF) { 288 errno = 0; 289 ASSERT_EQ(-1, iconv_close(INVALID_ICONV_T)); 290 ASSERT_ERRNO(EBADF); 291 } 292 RoundTrip(const char * dst_enc,const char * expected_bytes,size_t n)293 static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) { 294 // Examples from https://en.wikipedia.org/wiki/UTF-16. 295 const char* utf8 = "$€��"; // U+0024, U+20AC, U+10437. 296 297 iconv_t c = iconv_open(dst_enc, "UTF-8"); 298 ASSERT_NE(INVALID_ICONV_T, c) << dst_enc; 299 300 char* in = const_cast<char*>(utf8); 301 size_t in_bytes = strlen(utf8); 302 char buf[BUFSIZ] = {}; 303 char* out = buf; 304 size_t out_bytes = sizeof(buf); 305 size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes); 306 307 // Check we got the bytes we were expecting. 308 for (size_t i = 0; i < n; ++i) { 309 EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc; 310 } 311 312 ASSERT_EQ(0, iconv_close(c)); 313 314 // We can't round-trip if there were replacements. 315 if (strstr(dst_enc, "ascii")) { 316 GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n"; 317 return; 318 } 319 ASSERT_EQ(0U, replacement_count); 320 321 c = iconv_open("UTF-8", dst_enc); 322 ASSERT_NE(INVALID_ICONV_T, c) << dst_enc; 323 324 in = buf; 325 in_bytes = n; 326 char buf2[BUFSIZ] = {}; 327 out = buf2; 328 out_bytes = sizeof(buf2); 329 iconv(c, &in, &in_bytes, &out, &out_bytes); 330 331 ASSERT_STREQ(utf8, buf2) << dst_enc; 332 333 ASSERT_EQ(0, iconv_close(c)); 334 } 335 TEST(iconv,iconv_round_trip_ascii)336 TEST(iconv, iconv_round_trip_ascii) { 337 RoundTrip("ascii//TRANSLIT", "$??", 3); 338 } 339 TEST(iconv,iconv_round_trip_utf8)340 TEST(iconv, iconv_round_trip_utf8) { 341 RoundTrip("utf8", "\x24\xe2\x82\xac\xf0\x90\x90\xb7", 8); 342 } 343 TEST(iconv,iconv_round_trip_utf16be)344 TEST(iconv, iconv_round_trip_utf16be) { 345 RoundTrip("utf16be", "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37", 8); 346 } 347 TEST(iconv,iconv_round_trip_utf16le)348 TEST(iconv, iconv_round_trip_utf16le) { 349 RoundTrip("utf16le", "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc", 8); 350 } 351 TEST(iconv,iconv_round_trip_utf32be)352 TEST(iconv, iconv_round_trip_utf32be) { 353 RoundTrip("utf32be", "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37", 12); 354 } 355 TEST(iconv,iconv_round_trip_utf32le)356 TEST(iconv, iconv_round_trip_utf32le) { 357 RoundTrip("utf32le", "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00", 12); 358 } 359 TEST(iconv,iconv_round_trip_wchar_t)360 TEST(iconv, iconv_round_trip_wchar_t) { 361 RoundTrip("wchar_t", "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00", 12); 362 } 363 Check(int expected_errno,const char * src_enc,const char * src,size_t n)364 static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) { 365 iconv_t c = iconv_open("wchar_t", src_enc); 366 char* in = const_cast<char*>(src); 367 size_t in_bytes = n; 368 wchar_t out_buf[16]; 369 size_t out_bytes = sizeof(out_buf); 370 char* out = reinterpret_cast<char*>(out_buf); 371 errno = 0; 372 ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 373 EXPECT_ERRNO(expected_errno); 374 EXPECT_EQ(0, iconv_close(c)); 375 } 376 TEST(iconv,iconv_EILSEQ_ascii)377 TEST(iconv, iconv_EILSEQ_ascii) { 378 Check(EILSEQ, "ASCII", "\xac", 1); // > 0x7f, so not ASCII. 379 } 380 TEST(iconv,iconv_EILSEQ_utf8_initial)381 TEST(iconv, iconv_EILSEQ_utf8_initial) { 382 Check(EILSEQ, "utf8", "\x82", 1); // Invalid initial byte. 383 } 384 TEST(iconv,iconv_EILSEQ_utf8_non_initial)385 TEST(iconv, iconv_EILSEQ_utf8_non_initial) { 386 Check(EILSEQ, "utf8", "\xe2\xe2\x82", 3); // Invalid second byte. 387 } 388 TEST(iconv,iconv_EILSEQ_utf16be_low_surrogate_first)389 TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) { 390 Check(EILSEQ, "utf16be", "\xdc\x37" "\xd8\x01", 4); 391 } 392 TEST(iconv,iconv_EILSEQ_utf16le_low_surrogate_first)393 TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) { 394 Check(EILSEQ, "utf16le", "\x37\xdc" "\x01\xd8", 4); 395 } 396 TEST(iconv,iconv_EINVAL_utf8_short)397 TEST(iconv, iconv_EINVAL_utf8_short) { 398 Check(EINVAL, "utf8", "\xe2\x82", 2); // Missing final byte of 3-byte sequence. 399 } 400 TEST(iconv,iconv_EINVAL_utf16be_short)401 TEST(iconv, iconv_EINVAL_utf16be_short) { 402 Check(EINVAL, "utf16be", "\x00", 1); // Missing second byte. 403 } 404 TEST(iconv,iconv_EINVAL_utf16be_missing_low_surrogate)405 TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) { 406 Check(EINVAL, "utf16be", "\xd8\x01", 2); 407 } 408 TEST(iconv,iconv_EINVAL_utf16be_half_low_surrogate)409 TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) { 410 Check(EINVAL, "utf16be", "\xd8\x01\xdc", 3); 411 } 412 TEST(iconv,iconv_EINVAL_utf16le_short)413 TEST(iconv, iconv_EINVAL_utf16le_short) { 414 Check(EINVAL, "utf16le", "\x24", 1); // Missing second byte. 415 } 416 TEST(iconv,iconv_EINVAL_utf16le_missing_low_surrogate)417 TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) { 418 Check(EINVAL, "utf16le", "\x01\xd8", 2); 419 } 420 TEST(iconv,iconv_EINVAL_utf16le_half_low_surrogate)421 TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) { 422 Check(EINVAL, "utf16le", "\x01\xd8\x37", 3); 423 } 424 TEST(iconv,iconv_EINVAL_utf32be_short)425 TEST(iconv, iconv_EINVAL_utf32be_short) { 426 Check(EINVAL, "utf32be", "\x00\x00\x00", 3); // Missing final byte. 427 } 428 TEST(iconv,iconv_EINVAL_utf32le_short)429 TEST(iconv, iconv_EINVAL_utf32le_short) { 430 Check(EINVAL, "utf32le", "\x24\x00\x00", 3); // Missing final byte. 431 } 432 TEST(iconv,iconv_initial_shift_state)433 TEST(iconv, iconv_initial_shift_state) { 434 // POSIX: "For state-dependent encodings, the conversion descriptor 435 // cd is placed into its initial shift state by a call for which inbuf 436 // is a null pointer, or for which inbuf points to a null pointer." 437 iconv_t c = iconv_open("utf8", "utf8"); 438 char* in = nullptr; 439 size_t in_bytes = 0; 440 wchar_t out_buf[16]; 441 size_t out_bytes = sizeof(out_buf); 442 char* out = reinterpret_cast<char*>(out_buf); 443 444 // Points to a null pointer... 445 errno = 0; 446 ASSERT_EQ(static_cast<size_t>(0), iconv(c, &in, &in_bytes, &out, &out_bytes)); 447 EXPECT_ERRNO(0); 448 EXPECT_EQ(sizeof(out_buf), out_bytes); 449 450 // Is a null pointer... 451 errno = 0; 452 ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, &in_bytes, &out, &out_bytes)); 453 EXPECT_ERRNO(0); 454 EXPECT_EQ(sizeof(out_buf), out_bytes); 455 456 // Is a null pointer and so is in_bytes. This isn't specified by POSIX, but 457 // glibc and macOS both allow that, where Android historically didn't. 458 // https://issuetracker.google.com/180598400 459 errno = 0; 460 ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, nullptr, &out, &out_bytes)); 461 EXPECT_ERRNO(0); 462 EXPECT_EQ(sizeof(out_buf), out_bytes); 463 464 EXPECT_EQ(0, iconv_close(c)); 465 } 466