xref: /aosp_15_r20/external/mesa3d/src/util/streaming-load-memcpy.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
/*
 * Copyright © 2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <[email protected]>
 *    Matt Turner <[email protected]>
 *
 */
28*61046927SAndroid Build Coastguard Worker 
29*61046927SAndroid Build Coastguard Worker #include "util/streaming-load-memcpy.h"
30*61046927SAndroid Build Coastguard Worker #include "util/u_cpu_detect.h"
31*61046927SAndroid Build Coastguard Worker #include "util/u_math.h"
32*61046927SAndroid Build Coastguard Worker #ifdef USE_SSE41
33*61046927SAndroid Build Coastguard Worker #include <smmintrin.h>
34*61046927SAndroid Build Coastguard Worker #endif
35*61046927SAndroid Build Coastguard Worker 
/**
 * Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming
 * read performance from uncached memory.
 *
 * \param dst  destination buffer; must not overlap \p src (both are
 *             restrict-qualified and the copy is done with memcpy()).
 * \param src  source buffer.
 * \param len  number of bytes to copy; 0 is allowed and copies nothing.
 *
 * When USE_SSE41 is not defined at build time the whole copy is a single
 * memcpy().  When it is defined, the streaming path is only taken if the CPU
 * reports SSE4.1 and dst/src share the same offset within a 16-byte line;
 * otherwise the function also falls back to memcpy().
 */
void
util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
{
   char *restrict d = dst;
   char *restrict s = src;

#ifdef USE_SSE41
   /* If dst and src are not co-aligned, or if SSE4.1 is not present, fallback
    * to memcpy().
    */
   if (((uintptr_t)d & 15) != ((uintptr_t)s & 15) ||
       !util_get_cpu_caps()->has_sse4_1) {
      memcpy(d, s, len);
      return;
   }

   /* memcpy() the misaligned header. At the end of this if block, <d> and <s>
    * are aligned to a 16-byte boundary or <len> == 0.
    */
   if ((uintptr_t)d & 15) {
      uintptr_t bytes_before_alignment_boundary = 16 - ((uintptr_t)d & 15);
      assert(bytes_before_alignment_boundary < 16);

      /* Never copy more than the caller asked for: a short request may end
       * before the next 16-byte boundary is reached.
       */
      const size_t header_len = MIN2(bytes_before_alignment_boundary, len);

      memcpy(d, s, header_len);

      d = (char *)align_uintptr((uintptr_t)d, 16);
      s = (char *)align_uintptr((uintptr_t)s, 16);
      len -= header_len;
   }

   /* Only issue the fence when at least one 64-byte streaming iteration will
    * actually run.
    * NOTE(review): presumably this orders earlier memory accesses ahead of
    * the weakly-ordered streaming loads below -- confirm against the Intel
    * documentation for MOVNTDQA.
    */
   if (len >= 64)
      _mm_mfence();

   /* Main loop: move 64 bytes per iteration as four 16-byte streaming loads
    * followed by four aligned 16-byte stores.  All four loads are issued
    * before the first store.
    */
   while (len >= 64) {
      __m128i *dst_cacheline = (__m128i *)d;
      __m128i *src_cacheline = (__m128i *)s;

      __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0);
      __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1);
      __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2);
      __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);

      _mm_store_si128(dst_cacheline + 0, temp1);
      _mm_store_si128(dst_cacheline + 1, temp2);
      _mm_store_si128(dst_cacheline + 2, temp3);
      _mm_store_si128(dst_cacheline + 3, temp4);

      d += 64;
      s += 64;
      len -= 64;
   }
#endif
   /* memcpy() the tail: whatever is left after the streaming loop (fewer than
    * 64 bytes), or the entire buffer when USE_SSE41 is not defined.
    */
   if (len) {
      memcpy(d, s, len);
   }
}
93