1*71db0c75SAndroid Build Coastguard Worker //===-- String utils --------------------------------------------*- C++ -*-===//
2*71db0c75SAndroid Build Coastguard Worker //
3*71db0c75SAndroid Build Coastguard Worker // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*71db0c75SAndroid Build Coastguard Worker // See https://llvm.org/LICENSE.txt for license information.
5*71db0c75SAndroid Build Coastguard Worker // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*71db0c75SAndroid Build Coastguard Worker //
7*71db0c75SAndroid Build Coastguard Worker //===----------------------------------------------------------------------===//
8*71db0c75SAndroid Build Coastguard Worker //
9*71db0c75SAndroid Build Coastguard Worker // Standalone string utility functions. Utilities requiring memory allocations
10*71db0c75SAndroid Build Coastguard Worker // should be placed in allocating_string_utils.h instead.
11*71db0c75SAndroid Build Coastguard Worker //
12*71db0c75SAndroid Build Coastguard Worker //===----------------------------------------------------------------------===//
13*71db0c75SAndroid Build Coastguard Worker
14*71db0c75SAndroid Build Coastguard Worker #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H
15*71db0c75SAndroid Build Coastguard Worker #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H
16*71db0c75SAndroid Build Coastguard Worker
17*71db0c75SAndroid Build Coastguard Worker #include "src/__support/CPP/bitset.h"
18*71db0c75SAndroid Build Coastguard Worker #include "src/__support/macros/config.h"
19*71db0c75SAndroid Build Coastguard Worker #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
20*71db0c75SAndroid Build Coastguard Worker #include "src/string/memory_utils/inline_bzero.h"
21*71db0c75SAndroid Build Coastguard Worker #include "src/string/memory_utils/inline_memcpy.h"
22*71db0c75SAndroid Build Coastguard Worker #include <stddef.h> // For size_t
23*71db0c75SAndroid Build Coastguard Worker
24*71db0c75SAndroid Build Coastguard Worker namespace LIBC_NAMESPACE_DECL {
25*71db0c75SAndroid Build Coastguard Worker namespace internal {
26*71db0c75SAndroid Build Coastguard Worker
repeat_byte(Word byte)27*71db0c75SAndroid Build Coastguard Worker template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
28*71db0c75SAndroid Build Coastguard Worker constexpr size_t BITS_IN_BYTE = 8;
29*71db0c75SAndroid Build Coastguard Worker constexpr size_t BYTE_MASK = 0xff;
30*71db0c75SAndroid Build Coastguard Worker Word result = 0;
31*71db0c75SAndroid Build Coastguard Worker byte = byte & BYTE_MASK;
32*71db0c75SAndroid Build Coastguard Worker for (size_t i = 0; i < sizeof(Word); ++i)
33*71db0c75SAndroid Build Coastguard Worker result = (result << BITS_IN_BYTE) | byte;
34*71db0c75SAndroid Build Coastguard Worker return result;
35*71db0c75SAndroid Build Coastguard Worker }
36*71db0c75SAndroid Build Coastguard Worker
37*71db0c75SAndroid Build Coastguard Worker // The goal of this function is to take in a block of arbitrary size and return
38*71db0c75SAndroid Build Coastguard Worker // if it has any bytes equal to zero without branching. This is done by
39*71db0c75SAndroid Build Coastguard Worker // transforming the block such that zero bytes become non-zero and non-zero
40*71db0c75SAndroid Build Coastguard Worker // bytes become zero.
41*71db0c75SAndroid Build Coastguard Worker // The first transformation relies on the properties of carrying in arithmetic
42*71db0c75SAndroid Build Coastguard Worker // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
43*71db0c75SAndroid Build Coastguard Worker // then the result for that byte must be equal to 0xff (or 0xfe if the next byte
44*71db0c75SAndroid Build Coastguard Worker // needs a carry as well).
45*71db0c75SAndroid Build Coastguard Worker // The next transformation is a simple mask. All zero bytes will have the high
46*71db0c75SAndroid Build Coastguard Worker // bit set after the subtraction, so each byte is masked with 0x80. This narrows
47*71db0c75SAndroid Build Coastguard Worker // the set of bytes that result in a non-zero value to only zero bytes and bytes
48*71db0c75SAndroid Build Coastguard Worker // with the high bit and any other bit set.
49*71db0c75SAndroid Build Coastguard Worker // The final transformation masks the result of the previous transformations
50*71db0c75SAndroid Build Coastguard Worker // with the inverse of the original byte. This means that any byte that had the
51*71db0c75SAndroid Build Coastguard Worker // high bit set will no longer have it set, narrowing the list of bytes which
52*71db0c75SAndroid Build Coastguard Worker // result in non-zero values to just the zero byte.
has_zeroes(Word block)53*71db0c75SAndroid Build Coastguard Worker template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
54*71db0c75SAndroid Build Coastguard Worker constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
55*71db0c75SAndroid Build Coastguard Worker constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
56*71db0c75SAndroid Build Coastguard Worker Word subtracted = block - LOW_BITS;
57*71db0c75SAndroid Build Coastguard Worker Word inverted = ~block;
58*71db0c75SAndroid Build Coastguard Worker return (subtracted & inverted & HIGH_BITS) != 0;
59*71db0c75SAndroid Build Coastguard Worker }
60*71db0c75SAndroid Build Coastguard Worker
61*71db0c75SAndroid Build Coastguard Worker template <typename Word>
string_length_wide_read(const char * src)62*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE size_t string_length_wide_read(const char *src) {
63*71db0c75SAndroid Build Coastguard Worker const char *char_ptr = src;
64*71db0c75SAndroid Build Coastguard Worker // Step 1: read 1 byte at a time to align to block size
65*71db0c75SAndroid Build Coastguard Worker for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
66*71db0c75SAndroid Build Coastguard Worker ++char_ptr) {
67*71db0c75SAndroid Build Coastguard Worker if (*char_ptr == '\0')
68*71db0c75SAndroid Build Coastguard Worker return char_ptr - src;
69*71db0c75SAndroid Build Coastguard Worker }
70*71db0c75SAndroid Build Coastguard Worker // Step 2: read blocks
71*71db0c75SAndroid Build Coastguard Worker for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
72*71db0c75SAndroid Build Coastguard Worker !has_zeroes<Word>(*block_ptr); ++block_ptr) {
73*71db0c75SAndroid Build Coastguard Worker char_ptr = reinterpret_cast<const char *>(block_ptr);
74*71db0c75SAndroid Build Coastguard Worker }
75*71db0c75SAndroid Build Coastguard Worker // Step 3: find the zero in the block
76*71db0c75SAndroid Build Coastguard Worker for (; *char_ptr != '\0'; ++char_ptr) {
77*71db0c75SAndroid Build Coastguard Worker ;
78*71db0c75SAndroid Build Coastguard Worker }
79*71db0c75SAndroid Build Coastguard Worker return char_ptr - src;
80*71db0c75SAndroid Build Coastguard Worker }
81*71db0c75SAndroid Build Coastguard Worker
string_length_byte_read(const char * src)82*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE size_t string_length_byte_read(const char *src) {
83*71db0c75SAndroid Build Coastguard Worker size_t length;
84*71db0c75SAndroid Build Coastguard Worker for (length = 0; *src; ++src, ++length)
85*71db0c75SAndroid Build Coastguard Worker ;
86*71db0c75SAndroid Build Coastguard Worker return length;
87*71db0c75SAndroid Build Coastguard Worker }
88*71db0c75SAndroid Build Coastguard Worker
89*71db0c75SAndroid Build Coastguard Worker // Returns the length of a string, denoted by the first occurrence
90*71db0c75SAndroid Build Coastguard Worker // of a null terminator.
string_length(const char * src)91*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE size_t string_length(const char *src) {
92*71db0c75SAndroid Build Coastguard Worker #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
93*71db0c75SAndroid Build Coastguard Worker // Unsigned int is the default size for most processors, and on x86-64 it
94*71db0c75SAndroid Build Coastguard Worker // performs better than larger sizes when the src pointer can't be assumed to
95*71db0c75SAndroid Build Coastguard Worker // be aligned to a word boundary, so it's the size we use for reading the
96*71db0c75SAndroid Build Coastguard Worker // string a block at a time.
97*71db0c75SAndroid Build Coastguard Worker return string_length_wide_read<unsigned int>(src);
98*71db0c75SAndroid Build Coastguard Worker #else
99*71db0c75SAndroid Build Coastguard Worker return string_length_byte_read(src);
100*71db0c75SAndroid Build Coastguard Worker #endif
101*71db0c75SAndroid Build Coastguard Worker }
102*71db0c75SAndroid Build Coastguard Worker
103*71db0c75SAndroid Build Coastguard Worker template <typename Word>
find_first_character_wide_read(const unsigned char * src,unsigned char ch,size_t n)104*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src,
105*71db0c75SAndroid Build Coastguard Worker unsigned char ch, size_t n) {
106*71db0c75SAndroid Build Coastguard Worker const unsigned char *char_ptr = src;
107*71db0c75SAndroid Build Coastguard Worker size_t cur = 0;
108*71db0c75SAndroid Build Coastguard Worker
109*71db0c75SAndroid Build Coastguard Worker // Step 1: read 1 byte at a time to align to block size
110*71db0c75SAndroid Build Coastguard Worker for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
111*71db0c75SAndroid Build Coastguard Worker ++char_ptr, ++cur) {
112*71db0c75SAndroid Build Coastguard Worker if (*char_ptr == ch)
113*71db0c75SAndroid Build Coastguard Worker return const_cast<unsigned char *>(char_ptr);
114*71db0c75SAndroid Build Coastguard Worker }
115*71db0c75SAndroid Build Coastguard Worker
116*71db0c75SAndroid Build Coastguard Worker const Word ch_mask = repeat_byte<Word>(ch);
117*71db0c75SAndroid Build Coastguard Worker
118*71db0c75SAndroid Build Coastguard Worker // Step 2: read blocks
119*71db0c75SAndroid Build Coastguard Worker for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
120*71db0c75SAndroid Build Coastguard Worker !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
121*71db0c75SAndroid Build Coastguard Worker ++block_ptr, cur += sizeof(Word)) {
122*71db0c75SAndroid Build Coastguard Worker char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
123*71db0c75SAndroid Build Coastguard Worker }
124*71db0c75SAndroid Build Coastguard Worker
125*71db0c75SAndroid Build Coastguard Worker // Step 3: find the match in the block
126*71db0c75SAndroid Build Coastguard Worker for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
127*71db0c75SAndroid Build Coastguard Worker ;
128*71db0c75SAndroid Build Coastguard Worker }
129*71db0c75SAndroid Build Coastguard Worker
130*71db0c75SAndroid Build Coastguard Worker if (*char_ptr != ch || cur >= n)
131*71db0c75SAndroid Build Coastguard Worker return static_cast<void *>(nullptr);
132*71db0c75SAndroid Build Coastguard Worker
133*71db0c75SAndroid Build Coastguard Worker return const_cast<unsigned char *>(char_ptr);
134*71db0c75SAndroid Build Coastguard Worker }
135*71db0c75SAndroid Build Coastguard Worker
find_first_character_byte_read(const unsigned char * src,unsigned char ch,size_t n)136*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
137*71db0c75SAndroid Build Coastguard Worker unsigned char ch, size_t n) {
138*71db0c75SAndroid Build Coastguard Worker for (; n && *src != ch; --n, ++src)
139*71db0c75SAndroid Build Coastguard Worker ;
140*71db0c75SAndroid Build Coastguard Worker return n ? const_cast<unsigned char *>(src) : nullptr;
141*71db0c75SAndroid Build Coastguard Worker }
142*71db0c75SAndroid Build Coastguard Worker
143*71db0c75SAndroid Build Coastguard Worker // Returns the first occurrence of 'ch' within the first 'n' characters of
144*71db0c75SAndroid Build Coastguard Worker // 'src'. If 'ch' is not found, returns nullptr.
find_first_character(const unsigned char * src,unsigned char ch,size_t max_strlen)145*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE void *find_first_character(const unsigned char *src,
146*71db0c75SAndroid Build Coastguard Worker unsigned char ch, size_t max_strlen) {
147*71db0c75SAndroid Build Coastguard Worker #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
148*71db0c75SAndroid Build Coastguard Worker // If the maximum size of the string is small, the overhead of aligning to a
149*71db0c75SAndroid Build Coastguard Worker // word boundary and generating a bitmask of the appropriate size may be
150*71db0c75SAndroid Build Coastguard Worker // greater than the gains from reading larger chunks. Based on some testing,
151*71db0c75SAndroid Build Coastguard Worker // the crossover point between when it's faster to just read bytewise and read
152*71db0c75SAndroid Build Coastguard Worker // blocks is somewhere between 16 and 32, so 4 times the size of the block
153*71db0c75SAndroid Build Coastguard Worker // should be in that range.
154*71db0c75SAndroid Build Coastguard Worker // Unsigned int is used for the same reason as in strlen.
155*71db0c75SAndroid Build Coastguard Worker using BlockType = unsigned int;
156*71db0c75SAndroid Build Coastguard Worker if (max_strlen > (sizeof(BlockType) * 4)) {
157*71db0c75SAndroid Build Coastguard Worker return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
158*71db0c75SAndroid Build Coastguard Worker }
159*71db0c75SAndroid Build Coastguard Worker #endif
160*71db0c75SAndroid Build Coastguard Worker return find_first_character_byte_read(src, ch, max_strlen);
161*71db0c75SAndroid Build Coastguard Worker }
162*71db0c75SAndroid Build Coastguard Worker
163*71db0c75SAndroid Build Coastguard Worker // Returns the maximum length span that contains only characters not found in
164*71db0c75SAndroid Build Coastguard Worker // 'segment'. If no characters are found, returns the length of 'src'.
complementary_span(const char * src,const char * segment)165*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
166*71db0c75SAndroid Build Coastguard Worker const char *initial = src;
167*71db0c75SAndroid Build Coastguard Worker cpp::bitset<256> bitset;
168*71db0c75SAndroid Build Coastguard Worker
169*71db0c75SAndroid Build Coastguard Worker for (; *segment; ++segment)
170*71db0c75SAndroid Build Coastguard Worker bitset.set(*reinterpret_cast<const unsigned char *>(segment));
171*71db0c75SAndroid Build Coastguard Worker for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src));
172*71db0c75SAndroid Build Coastguard Worker ++src)
173*71db0c75SAndroid Build Coastguard Worker ;
174*71db0c75SAndroid Build Coastguard Worker return src - initial;
175*71db0c75SAndroid Build Coastguard Worker }
176*71db0c75SAndroid Build Coastguard Worker
177*71db0c75SAndroid Build Coastguard Worker // Given the similarities between strtok and strtok_r, we can implement both
178*71db0c75SAndroid Build Coastguard Worker // using a utility function. On the first call, 'src' is scanned for the
179*71db0c75SAndroid Build Coastguard Worker // first character not found in 'delimiter_string'. Once found, it scans until
180*71db0c75SAndroid Build Coastguard Worker // the first character in the 'delimiter_string' or the null terminator is
181*71db0c75SAndroid Build Coastguard Worker // found. We define this span as a token. The end of the token is appended with
182*71db0c75SAndroid Build Coastguard Worker // a null terminator, and the token is returned. The point where the last token
183*71db0c75SAndroid Build Coastguard Worker // is found is then stored within 'context' for subsequent calls. Subsequent
184*71db0c75SAndroid Build Coastguard Worker // calls will use 'context' when a nullptr is passed in for 'src'. Once the null
185*71db0c75SAndroid Build Coastguard Worker // terminating character is reached, returns a nullptr.
186*71db0c75SAndroid Build Coastguard Worker template <bool SkipDelim = true>
string_token(char * __restrict src,const char * __restrict delimiter_string,char ** __restrict saveptr)187*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE char *string_token(char *__restrict src,
188*71db0c75SAndroid Build Coastguard Worker const char *__restrict delimiter_string,
189*71db0c75SAndroid Build Coastguard Worker char **__restrict saveptr) {
190*71db0c75SAndroid Build Coastguard Worker // Return nullptr immediately if both src AND saveptr are nullptr
191*71db0c75SAndroid Build Coastguard Worker if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
192*71db0c75SAndroid Build Coastguard Worker return nullptr;
193*71db0c75SAndroid Build Coastguard Worker
194*71db0c75SAndroid Build Coastguard Worker cpp::bitset<256> delimiter_set;
195*71db0c75SAndroid Build Coastguard Worker for (; *delimiter_string != '\0'; ++delimiter_string)
196*71db0c75SAndroid Build Coastguard Worker delimiter_set.set(*delimiter_string);
197*71db0c75SAndroid Build Coastguard Worker
198*71db0c75SAndroid Build Coastguard Worker if constexpr (SkipDelim)
199*71db0c75SAndroid Build Coastguard Worker for (; *src != '\0' && delimiter_set.test(*src); ++src)
200*71db0c75SAndroid Build Coastguard Worker ;
201*71db0c75SAndroid Build Coastguard Worker if (*src == '\0') {
202*71db0c75SAndroid Build Coastguard Worker *saveptr = src;
203*71db0c75SAndroid Build Coastguard Worker return nullptr;
204*71db0c75SAndroid Build Coastguard Worker }
205*71db0c75SAndroid Build Coastguard Worker char *token = src;
206*71db0c75SAndroid Build Coastguard Worker for (; *src != '\0'; ++src) {
207*71db0c75SAndroid Build Coastguard Worker if (delimiter_set.test(*src)) {
208*71db0c75SAndroid Build Coastguard Worker *src = '\0';
209*71db0c75SAndroid Build Coastguard Worker ++src;
210*71db0c75SAndroid Build Coastguard Worker break;
211*71db0c75SAndroid Build Coastguard Worker }
212*71db0c75SAndroid Build Coastguard Worker }
213*71db0c75SAndroid Build Coastguard Worker *saveptr = src;
214*71db0c75SAndroid Build Coastguard Worker return token;
215*71db0c75SAndroid Build Coastguard Worker }
216*71db0c75SAndroid Build Coastguard Worker
strlcpy(char * __restrict dst,const char * __restrict src,size_t size)217*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
218*71db0c75SAndroid Build Coastguard Worker size_t size) {
219*71db0c75SAndroid Build Coastguard Worker size_t len = internal::string_length(src);
220*71db0c75SAndroid Build Coastguard Worker if (!size)
221*71db0c75SAndroid Build Coastguard Worker return len;
222*71db0c75SAndroid Build Coastguard Worker size_t n = len < size - 1 ? len : size - 1;
223*71db0c75SAndroid Build Coastguard Worker inline_memcpy(dst, src, n);
224*71db0c75SAndroid Build Coastguard Worker dst[n] = '\0';
225*71db0c75SAndroid Build Coastguard Worker return len;
226*71db0c75SAndroid Build Coastguard Worker }
227*71db0c75SAndroid Build Coastguard Worker
228*71db0c75SAndroid Build Coastguard Worker template <bool ReturnNull = true>
strchr_implementation(const char * src,int c)229*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE constexpr static char *strchr_implementation(const char *src,
230*71db0c75SAndroid Build Coastguard Worker int c) {
231*71db0c75SAndroid Build Coastguard Worker char ch = static_cast<char>(c);
232*71db0c75SAndroid Build Coastguard Worker for (; *src && *src != ch; ++src)
233*71db0c75SAndroid Build Coastguard Worker ;
234*71db0c75SAndroid Build Coastguard Worker char *ret = ReturnNull ? nullptr : const_cast<char *>(src);
235*71db0c75SAndroid Build Coastguard Worker return *src == ch ? const_cast<char *>(src) : ret;
236*71db0c75SAndroid Build Coastguard Worker }
237*71db0c75SAndroid Build Coastguard Worker
strrchr_implementation(const char * src,int c)238*71db0c75SAndroid Build Coastguard Worker LIBC_INLINE constexpr static char *strrchr_implementation(const char *src,
239*71db0c75SAndroid Build Coastguard Worker int c) {
240*71db0c75SAndroid Build Coastguard Worker char ch = static_cast<char>(c);
241*71db0c75SAndroid Build Coastguard Worker char *last_occurrence = nullptr;
242*71db0c75SAndroid Build Coastguard Worker while (true) {
243*71db0c75SAndroid Build Coastguard Worker if (*src == ch)
244*71db0c75SAndroid Build Coastguard Worker last_occurrence = const_cast<char *>(src);
245*71db0c75SAndroid Build Coastguard Worker if (!*src)
246*71db0c75SAndroid Build Coastguard Worker return last_occurrence;
247*71db0c75SAndroid Build Coastguard Worker ++src;
248*71db0c75SAndroid Build Coastguard Worker }
249*71db0c75SAndroid Build Coastguard Worker }
250*71db0c75SAndroid Build Coastguard Worker
251*71db0c75SAndroid Build Coastguard Worker } // namespace internal
252*71db0c75SAndroid Build Coastguard Worker } // namespace LIBC_NAMESPACE_DECL
253*71db0c75SAndroid Build Coastguard Worker
254*71db0c75SAndroid Build Coastguard Worker #endif // LLVM_LIBC_SRC_STRING_STRING_UTILS_H
255