xref: /aosp_15_r20/external/llvm-libc/src/string/memory_utils/op_x86.h (revision 71db0c75aadcf003ffe3238005f61d7618a3fead)
1 //===-- x86 implementation of memory function building blocks -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file provides x86 specific building blocks to compose memory functions.
10 //
11 //===----------------------------------------------------------------------===//
12 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
13 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
14 
15 #include "src/__support/macros/config.h"
16 #include "src/__support/macros/properties/architectures.h"
17 
18 #if defined(LIBC_TARGET_ARCH_IS_X86)
19 
20 #include "src/__support/common.h"
21 #include "src/string/memory_utils/op_builtin.h"
22 #include "src/string/memory_utils/op_generic.h"
23 
24 #if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
25     defined(__SSE2__)
26 #include <immintrin.h>
27 #endif
28 
// Provide stand-in macros (each expanding to 0) for intrinsics whose CPU
// extension is not enabled, so that the compiler does not fail on undefined
// functions. The stand-ins are only installed when _MSC_VER or __SCE__ is
// defined.
#if defined(_MSC_VER) || defined(__SCE__)
#if !defined(__AVX512BW__)
#undef _mm512_cmpneq_epi8_mask
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif // !defined(__AVX512BW__)
#if !defined(__AVX2__)
#undef _mm256_movemask_epi8
#define _mm256_movemask_epi8(A) 0
#endif // !defined(__AVX2__)
#if !defined(__SSE2__)
#undef _mm_movemask_epi8
#define _mm_movemask_epi8(A) 0
#endif // !defined(__SSE2__)
#endif // defined(_MSC_VER) || defined(__SCE__)
43 
44 namespace LIBC_NAMESPACE_DECL {
45 namespace x86 {
46 
47 // A set of constants to check compile time features.
48 LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
49 LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
50 LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
51 LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
52 LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
53 LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
54 
55 ///////////////////////////////////////////////////////////////////////////////
56 // Memcpy repmovsb implementation
57 struct Memcpy {
repmovsbMemcpy58   LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
59     asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
60   }
61 };
62 
63 } // namespace x86
64 } // namespace LIBC_NAMESPACE_DECL
65 
66 namespace LIBC_NAMESPACE_DECL {
67 namespace generic {
68 
69 // Not equals: returns non-zero iff values at head or tail differ.
70 // This function typically loads more data than necessary when the two buffer
71 // differs.
72 template <typename T>
branchless_head_tail_neq(CPtr p1,CPtr p2,size_t count)73 LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
74   static_assert(cpp::is_integral_v<T>);
75   return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
76 }
77 
78 ///////////////////////////////////////////////////////////////////////////////
79 // Specializations for uint16_t
80 template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
81 template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
82   return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
83 }
84 template <>
85 LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
86   return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
87 }
88 template <>
89 LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
90   return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
91          static_cast<int32_t>(load_be<uint16_t>(p2, offset));
92 }
93 template <>
94 LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);
95 
96 ///////////////////////////////////////////////////////////////////////////////
97 // Specializations for uint32_t
98 template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
99 template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
100   return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
101 }
102 template <>
103 LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
104   return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
105 }
106 template <>
107 LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
108   const auto a = load_be<uint32_t>(p1, offset);
109   const auto b = load_be<uint32_t>(p2, offset);
110   return cmp_uint32_t(a, b);
111 }
112 template <>
113 LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);
114 
115 ///////////////////////////////////////////////////////////////////////////////
116 // Specializations for uint64_t
117 template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
118 template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
119   return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
120 }
121 template <>
122 LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
123   return !eq<uint64_t>(p1, p2, offset);
124 }
125 template <>
126 LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
127 template <>
128 LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
129                                                size_t offset) {
130   const auto a = load_be<uint64_t>(p1, offset);
131   const auto b = load_be<uint64_t>(p2, offset);
132   return cmp_neq_uint64_t(a, b);
133 }
134 
135 // SIMD types are defined with attributes. e.g., '__m128i' is defined as
136 // long long  __attribute__((__vector_size__(16), __aligned__(16)))
137 // When we use these SIMD types in template specialization GCC complains:
138 // "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
139 // Therefore, we disable this warning in this file.
140 #pragma GCC diagnostic push
141 #pragma GCC diagnostic ignored "-Wignored-attributes"
142 
143 ///////////////////////////////////////////////////////////////////////////////
144 // Specializations for __m128i
145 #if defined(__SSE4_1__)
146 template <> struct is_vector<__m128i> : cpp::true_type {};
147 template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
148 LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
149   const auto a = load<__m128i>(p1, offset);
150   const auto b = load<__m128i>(p2, offset);
151   return _mm_xor_si128(a, b);
152 }
153 LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
154   return _mm_max_epu8(a, b);
155 }
156 LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
157   return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
158                                               8, 9, 10, 11, 12, 13, 14, 15));
159 }
160 LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
161   return static_cast<uint16_t>(
162       _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
163 }
164 LIBC_INLINE bool is_zero(__m128i value) {
165   return _mm_testz_si128(value, value) == 1;
166 }
167 template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
168   return is_zero(load_and_xor_m128i(p1, p2, offset));
169 }
170 template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
171   return !is_zero(load_and_xor_m128i(p1, p2, offset));
172 }
173 template <>
174 LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
175                                                        size_t count) {
176   const __m128i head = load_and_xor_m128i(p1, p2, 0);
177   const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
178   return !is_zero(_mm_or_si128(head, tail));
179 }
180 template <>
181 LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
182   const auto a = load<__m128i>(p1, offset);
183   const auto b = load<__m128i>(p2, offset);
184   const auto vmax = bytewise_max(a, b);
185   const auto le = big_endian_cmp_mask(vmax, b);
186   const auto ge = big_endian_cmp_mask(vmax, a);
187   static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
188   return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
189 }
190 #endif // __SSE4_1__
191 
192 ///////////////////////////////////////////////////////////////////////////////
193 // Specializations for __m256i
194 #if defined(__AVX__)
195 template <> struct is_vector<__m256i> : cpp::true_type {};
196 template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
197 LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
198   return _mm256_castps_si256(
199       _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
200 }
201 LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
202   return _mm256_castps_si256(
203       _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
204 }
205 LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
206   const auto a = load<__m256i>(p1, offset);
207   const auto b = load<__m256i>(p2, offset);
208   return xor_m256i(a, b);
209 }
210 LIBC_INLINE bool is_zero(__m256i value) {
211   return _mm256_testz_si256(value, value) == 1;
212 }
213 template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
214   return is_zero(load_and_xor_m256i(p1, p2, offset));
215 }
216 template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
217   return !is_zero(load_and_xor_m256i(p1, p2, offset));
218 }
219 template <>
220 LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
221                                                        size_t count) {
222   const __m256i head = load_and_xor_m256i(p1, p2, 0);
223   const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
224   return !is_zero(or_m256i(head, tail));
225 }
226 #endif // __AVX__
227 
228 #if defined(__AVX2__)
229 LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
230   return _mm256_max_epu8(a, b);
231 }
232 LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
233   // Bytewise comparison of 'max' and 'value'.
234   const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
235   // Because x86 is little endian, bytes in the vector must be reversed before
236   // using movemask.
237 #if defined(__AVX512VBMI__) && defined(__AVX512VL__)
238   // When AVX512BMI is available we can completely reverse the vector through
239   // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
240   const __m256i big_endian_byte_mask =
241       _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
242                                               8, 9, 10, 11, 12, 13, 14, 15,   //
243                                               16, 17, 18, 19, 20, 21, 22, 23, //
244                                               24, 25, 26, 27, 28, 29, 30, 31),
245                               little_endian_byte_mask);
246   // And turn the byte vector mask into an 'uint32_t' for direct scalar
247   // comparison.
248   return _mm256_movemask_epi8(big_endian_byte_mask);
249 #else
250   // We can't byte-reverse '__m256i' in a single instruction with AVX2.
251   // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane
252   // leading to:
253   // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
254   //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
255   // So we first shuffle each 16-byte lane leading to half-reversed vector mask.
256   const __m256i half_reversed = _mm256_shuffle_epi8(
257       little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
258                                                8, 9, 10, 11, 12, 13, 14, 15, //
259                                                0, 1, 2, 3, 4, 5, 6, 7,       //
260                                                8, 9, 10, 11, 12, 13, 14, 15));
261   // Then we turn the vector into an uint32_t.
262   const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
263   // And swap the lower and upper parts. This is optimized into a single `rorx`
264   // instruction.
265   return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
266 #endif
267 }
268 template <>
269 LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
270   const auto a = load<__m256i>(p1, offset);
271   const auto b = load<__m256i>(p2, offset);
272   const auto vmax = bytewise_max(a, b);
273   const auto le = big_endian_cmp_mask(vmax, b);
274   const auto ge = big_endian_cmp_mask(vmax, a);
275   static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
276   return cmp_neq_uint64_t(ge, le);
277 }
278 #endif // __AVX2__
279 
280 ///////////////////////////////////////////////////////////////////////////////
281 // Specializations for __m512i
282 #if defined(__AVX512BW__)
283 template <> struct is_vector<__m512i> : cpp::true_type {};
284 template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
285 LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
286   return _mm512_max_epu8(a, b);
287 }
288 LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
289   // The AVX512BMI version is disabled due to bad codegen.
290   // https://github.com/llvm/llvm-project/issues/77459
291   // https://github.com/llvm/llvm-project/pull/77081
292   // TODO: Re-enable when clang version meets the fixed version.
293 #if false && defined(__AVX512VBMI__)
294   // When AVX512BMI is available we can completely reverse the vector through
295   // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
296   const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
297                                        8, 9, 10, 11, 12, 13, 14, 15,   //
298                                        16, 17, 18, 19, 20, 21, 22, 23, //
299                                        24, 25, 26, 27, 28, 29, 30, 31, //
300                                        32, 33, 34, 35, 36, 37, 38, 39, //
301                                        40, 41, 42, 43, 44, 45, 46, 47, //
302                                        48, 49, 50, 51, 52, 53, 54, 55, //
303                                        56, 57, 58, 59, 60, 61, 62, 63);
304   // Then we compute the mask for equal bytes.
305   return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
306                                 _mm512_permutexvar_epi8(indices, value));
307 #else
308   // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
309   // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
310   // So we only reverse groups of 8 bytes, these groups are necessarily within a
311   // 16-byte lane.
312   // zmm = | 16 bytes  | 16 bytes  | 16 bytes  | 16 bytes  |
313   // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
314   const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
315                                           0, 1, 2, 3, 4, 5, 6, 7,       //
316                                           8, 9, 10, 11, 12, 13, 14, 15, //
317                                           0, 1, 2, 3, 4, 5, 6, 7,       //
318                                           8, 9, 10, 11, 12, 13, 14, 15, //
319                                           0, 1, 2, 3, 4, 5, 6, 7,       //
320                                           8, 9, 10, 11, 12, 13, 14, 15, //
321                                           0, 1, 2, 3, 4, 5, 6, 7);
322   // Then we compute the mask for equal bytes. In this mask the bits of each
323   // byte are already reversed but the byte themselves should be reversed, this
324   // is done by using a bswap instruction.
325   return __builtin_bswap64(
326       _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
327                              _mm512_shuffle_epi8(value, indices)));
328 
329 #endif
330 }
331 template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
332   const auto a = load<__m512i>(p1, offset);
333   const auto b = load<__m512i>(p2, offset);
334   return _mm512_cmpneq_epi8_mask(a, b) == 0;
335 }
336 template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
337   const auto a = load<__m512i>(p1, offset);
338   const auto b = load<__m512i>(p2, offset);
339   return _mm512_cmpneq_epi8_mask(a, b) != 0;
340 }
341 LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
342   const auto a = load<__m512i>(p1, offset);
343   const auto b = load<__m512i>(p2, offset);
344   return _mm512_xor_epi64(a, b);
345 }
346 LIBC_INLINE bool is_zero(__m512i value) {
347   return _mm512_test_epi32_mask(value, value) == 0;
348 }
349 template <>
350 LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
351                                                        size_t count) {
352   const __m512i head = load_and_xor_m512i(p1, p2, 0);
353   const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
354   return !is_zero(_mm512_or_epi64(head, tail));
355 }
356 template <>
357 LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
358   const auto a = load<__m512i>(p1, offset);
359   const auto b = load<__m512i>(p2, offset);
360   const auto vmax = bytewise_max(a, b);
361   const auto le = big_endian_cmp_mask(vmax, b);
362   const auto ge = big_endian_cmp_mask(vmax, a);
363   static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
364   return cmp_neq_uint64_t(ge, le);
365 }
366 #endif // __AVX512BW__
367 
368 #pragma GCC diagnostic pop
369 
370 } // namespace generic
371 } // namespace LIBC_NAMESPACE_DECL
372 
373 #endif // LIBC_TARGET_ARCH_IS_X86
374 
375 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
376