xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/common/activation_layer.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1*c217d954SCole FaustR"(
2*c217d954SCole Faust#if defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE)
3*c217d954SCole Faust
4*c217d954SCole Faust#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
5*c217d954SCole Faust
6*c217d954SCole Faust
7*c217d954SCole Faust
8*c217d954SCole Faust
9*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
10*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
11*c217d954SCole Faust
12*c217d954SCole Faust
13*c217d954SCole Faust
14*c217d954SCole Faust
15*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
16*c217d954SCole Faust    VSTORE(N0)                                                 \
17*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
18*c217d954SCole Faust
19*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
20*c217d954SCole Faust    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
21*c217d954SCole Faust    VSTORE(N0)                                                 \
22*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
23*c217d954SCole Faust
24*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
25*c217d954SCole Faust    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
26*c217d954SCole Faust    VSTORE(N0)                                                 \
27*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
28*c217d954SCole Faust
29*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
30*c217d954SCole Faust    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
31*c217d954SCole Faust    VSTORE(N0)                                                 \
32*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
33*c217d954SCole Faust
34*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
35*c217d954SCole Faust    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
36*c217d954SCole Faust    VSTORE(N0)                                                 \
37*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
38*c217d954SCole Faust
39*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
40*c217d954SCole Faust    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
41*c217d954SCole Faust    VSTORE(N0)                                                 \
42*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
43*c217d954SCole Faust
44*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
45*c217d954SCole Faust    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
46*c217d954SCole Faust    VSTORE(N0)                                                 \
47*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
48*c217d954SCole Faust
49*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
50*c217d954SCole Faust    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
51*c217d954SCole Faust    VSTORE(N0)                                                 \
52*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
53*c217d954SCole Faust
54*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
55*c217d954SCole Faust    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
56*c217d954SCole Faust    VSTORE(N0)                                                 \
57*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
58*c217d954SCole Faust
59*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
60*c217d954SCole Faust    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
61*c217d954SCole Faust    VSTORE(N0)                                                  \
62*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
63*c217d954SCole Faust
64*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
65*c217d954SCole Faust    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
66*c217d954SCole Faust    VSTORE(N0)                                                  \
67*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
68*c217d954SCole Faust
69*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
70*c217d954SCole Faust    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
71*c217d954SCole Faust    VSTORE(N0)                                                  \
72*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
73*c217d954SCole Faust
74*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
75*c217d954SCole Faust    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
76*c217d954SCole Faust    VSTORE(N0)                                                  \
77*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
78*c217d954SCole Faust
79*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
80*c217d954SCole Faust    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
81*c217d954SCole Faust    VSTORE(N0)                                                  \
82*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
83*c217d954SCole Faust
84*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
85*c217d954SCole Faust    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
86*c217d954SCole Faust    VSTORE(N0)                                                  \
87*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
88*c217d954SCole Faust
89*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
90*c217d954SCole Faust    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
91*c217d954SCole Faust    VSTORE(N0)                                                  \
92*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
93*c217d954SCole Faust
94*c217d954SCole Faust
95*c217d954SCole Faust
96*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
97*c217d954SCole Faust    VSTORE(N0)                                                         \
98*c217d954SCole Faust    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
99*c217d954SCole Faust
100*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
101*c217d954SCole Faust    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
102*c217d954SCole Faust    VSTORE(N0)                                                         \
103*c217d954SCole Faust    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
104*c217d954SCole Faust
105*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
106*c217d954SCole Faust    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
107*c217d954SCole Faust    VSTORE(N0)                                                         \
108*c217d954SCole Faust    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
109*c217d954SCole Faust
110*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
111*c217d954SCole Faust    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
112*c217d954SCole Faust    VSTORE(N0)                                                         \
113*c217d954SCole Faust    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
114*c217d954SCole Faust
115*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
116*c217d954SCole Faust    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
117*c217d954SCole Faust    VSTORE(N0)                                                         \
118*c217d954SCole Faust    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
119*c217d954SCole Faust
120*c217d954SCole Faust#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
121*c217d954SCole Faust    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
122*c217d954SCole Faust    VSTORE(N0)                                                         \
123*c217d954SCole Faust    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
124*c217d954SCole Faust
125*c217d954SCole Faust#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
126*c217d954SCole Faust    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
127*c217d954SCole Faust    VSTORE(N0)                                                         \
128*c217d954SCole Faust    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
129*c217d954SCole Faust
130*c217d954SCole Faust#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
131*c217d954SCole Faust    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
132*c217d954SCole Faust    VSTORE(N0)                                                         \
133*c217d954SCole Faust    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
134*c217d954SCole Faust
135*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
136*c217d954SCole Faust    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
137*c217d954SCole Faust    VSTORE(N0)                                                         \
138*c217d954SCole Faust    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
139*c217d954SCole Faust
140*c217d954SCole Faust#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
141*c217d954SCole Faust    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
142*c217d954SCole Faust    VSTORE(N0)                                                     \
143*c217d954SCole Faust    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
144*c217d954SCole Faust
145*c217d954SCole Faust#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
146*c217d954SCole Faust    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
147*c217d954SCole Faust    VSTORE(N0)                                                          \
148*c217d954SCole Faust    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
149*c217d954SCole Faust
150*c217d954SCole Faust#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
151*c217d954SCole Faust    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
152*c217d954SCole Faust    VSTORE(N0)                                                          \
153*c217d954SCole Faust    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
154*c217d954SCole Faust
155*c217d954SCole Faust#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
156*c217d954SCole Faust    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
157*c217d954SCole Faust    VSTORE(N0)                                                          \
158*c217d954SCole Faust    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
159*c217d954SCole Faust
160*c217d954SCole Faust#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
161*c217d954SCole Faust    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
162*c217d954SCole Faust    VSTORE(N0)                                                          \
163*c217d954SCole Faust    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
164*c217d954SCole Faust
165*c217d954SCole Faust#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
166*c217d954SCole Faust    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
167*c217d954SCole Faust    VSTORE(N0)                                                          \
168*c217d954SCole Faust    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
169*c217d954SCole Faust
170*c217d954SCole Faust#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
171*c217d954SCole Faust    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
172*c217d954SCole Faust    VSTORE(N0)                                                          \
173*c217d954SCole Faust    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
174*c217d954SCole Faust
175*c217d954SCole Faust
176*c217d954SCole Faust
177*c217d954SCole Faust
178*c217d954SCole Faust#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
179*c217d954SCole Faust#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
180*c217d954SCole Faust
181*c217d954SCole Faust
182*c217d954SCole Faust
183*c217d954SCole Faust#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
184*c217d954SCole Faust#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
185*c217d954SCole Faust
186*c217d954SCole Faust
187*c217d954SCole Faust
188*c217d954SCole Faust#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
189*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
190*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
191*c217d954SCole Faust
192*c217d954SCole Faust#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
193*c217d954SCole Faust    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
194*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
195*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
196*c217d954SCole Faust
197*c217d954SCole Faust#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
198*c217d954SCole Faust    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
199*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
200*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
201*c217d954SCole Faust
202*c217d954SCole Faust#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
203*c217d954SCole Faust    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
204*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
205*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
206*c217d954SCole Faust
207*c217d954SCole Faust#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
208*c217d954SCole Faust    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
209*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
210*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
211*c217d954SCole Faust
212*c217d954SCole Faust#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
213*c217d954SCole Faust    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
214*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
215*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
216*c217d954SCole Faust
217*c217d954SCole Faust#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
218*c217d954SCole Faust    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
219*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
220*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
221*c217d954SCole Faust
222*c217d954SCole Faust#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
223*c217d954SCole Faust    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
224*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
225*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
226*c217d954SCole Faust
227*c217d954SCole Faust#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
228*c217d954SCole Faust    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
229*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
230*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
231*c217d954SCole Faust
232*c217d954SCole Faust#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
233*c217d954SCole Faust    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
234*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
235*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
236*c217d954SCole Faust
237*c217d954SCole Faust#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
238*c217d954SCole Faust    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
239*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
240*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
241*c217d954SCole Faust
242*c217d954SCole Faust#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
243*c217d954SCole Faust    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
244*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
245*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
246*c217d954SCole Faust
247*c217d954SCole Faust#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
248*c217d954SCole Faust    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
249*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
250*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
251*c217d954SCole Faust
252*c217d954SCole Faust#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
253*c217d954SCole Faust    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
254*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
255*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
256*c217d954SCole Faust
257*c217d954SCole Faust#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
258*c217d954SCole Faust    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
259*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
260*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
261*c217d954SCole Faust
262*c217d954SCole Faust#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
263*c217d954SCole Faust    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
264*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
265*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
266*c217d954SCole Faust
267*c217d954SCole Faust
268*c217d954SCole Faust
269*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
270*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
271*c217d954SCole Faust
272*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
273*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
274*c217d954SCole Faust    {                                                                                                                                                     \
275*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
276*c217d954SCole Faust    }                                                                                                                                                     \
277*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
278*c217d954SCole Faust    {                                                                                                                                                     \
279*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
280*c217d954SCole Faust    }                                                                                                                                                     \
281*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
282*c217d954SCole Faust    {                                                                                                                                                     \
283*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
284*c217d954SCole Faust    }                                                                                                                                                     \
285*c217d954SCole Faust    else                                                                                                                                                  \
286*c217d954SCole Faust    {                                                                                                                                                     \
287*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
288*c217d954SCole Faust    }
289*c217d954SCole Faust
290*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
291*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                         \
292*c217d954SCole Faust    {                                                                                                             \
293*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
294*c217d954SCole Faust    }                                                                                                             \
295*c217d954SCole Faust    else                                                                                                          \
296*c217d954SCole Faust    {                                                                                                             \
297*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
298*c217d954SCole Faust    }
299*c217d954SCole Faust
300*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
301*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                         \
302*c217d954SCole Faust    {                                                                                                             \
303*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
304*c217d954SCole Faust    }                                                                                                             \
305*c217d954SCole Faust    else                                                                                                          \
306*c217d954SCole Faust    {                                                                                                             \
307*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
308*c217d954SCole Faust    }
309*c217d954SCole Faust
310*c217d954SCole Faust
311*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
312*c217d954SCole Faust
313*c217d954SCole Faust
314*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
315*c217d954SCole Faust
316*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
317*c217d954SCole Faust    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
318*c217d954SCole Faust
319*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
320*c217d954SCole Faust
321*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
322*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
323*c217d954SCole Faust
324*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
325*c217d954SCole Faust
326*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
327*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
328*c217d954SCole Faust
329*c217d954SCole Faust#else
330*c217d954SCole Faust
331*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
332*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
333*c217d954SCole Faust
334*c217d954SCole Faust#endif
335*c217d954SCole Faust
336*c217d954SCole Faust#endif
337*c217d954SCole Faust
338*c217d954SCole Faust
339*c217d954SCole Faust#if defined(PARTIAL_STORE_M0)
340*c217d954SCole Faust
341*c217d954SCole Faust#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
342*c217d954SCole Faust    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
343*c217d954SCole Faust#else
344*c217d954SCole Faust#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
345*c217d954SCole Faust    ((uint)(y * M0))
346*c217d954SCole Faust#endif
347*c217d954SCole Faust
348*c217d954SCole Faust
349*c217d954SCole Faust
350*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
351*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
352*c217d954SCole Faust
353*c217d954SCole Faust
354*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
355*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable
356*c217d954SCole Faust#endif
357*c217d954SCole Faust
358*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
359*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
360*c217d954SCole Faust#endif
361*c217d954SCole Faust
362*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
363*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
364*c217d954SCole Faust#endif
365*c217d954SCole Faust
366*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
367*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable
368*c217d954SCole Faust#endif
369*c217d954SCole Faust
370*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100
371*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200
372*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300
373*c217d954SCole Faust
374*c217d954SCole Faust
375*c217d954SCole Faust#define CONCAT(a, b) a##b
376*c217d954SCole Faust
377*c217d954SCole Faust
378*c217d954SCole Faust#define EXPAND(x) x
379*c217d954SCole Faust
380*c217d954SCole Faust
381*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
382*c217d954SCole Faust
383*c217d954SCole Faust
384*c217d954SCole Faust#define REV1(x) ((x))
385*c217d954SCole Faust#define REV2(x) ((x).s10)
386*c217d954SCole Faust#define REV3(x) ((x).s210)
387*c217d954SCole Faust#define REV4(x) ((x).s3210)
388*c217d954SCole Faust#define REV8(x) ((x).s76543210)
389*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210)
390*c217d954SCole Faust
391*c217d954SCole Faust
392*c217d954SCole Faust
393*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x))
394*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s)
395*c217d954SCole Faust
396*c217d954SCole Faust
397*c217d954SCole Faust
398*c217d954SCole Faust#define ROT1_0(x) ((x))
399*c217d954SCole Faust#define ROT1_1(x) ((x))
400*c217d954SCole Faust
401*c217d954SCole Faust#define ROT2_0(x) ((x))
402*c217d954SCole Faust#define ROT2_1(x) ((x).s10)
403*c217d954SCole Faust#define ROT2_2(x) ((x))
404*c217d954SCole Faust
405*c217d954SCole Faust#define ROT3_0(x) ((x))
406*c217d954SCole Faust#define ROT3_1(x) ((x).s201)
407*c217d954SCole Faust#define ROT3_2(x) ((x).s120)
408*c217d954SCole Faust#define ROT3_3(x) ((x))
409*c217d954SCole Faust
410*c217d954SCole Faust#define ROT4_0(x) ((x))
411*c217d954SCole Faust#define ROT4_1(x) ((x).s3012)
412*c217d954SCole Faust#define ROT4_2(x) ((x).s2301)
413*c217d954SCole Faust#define ROT4_3(x) ((x).s1230)
414*c217d954SCole Faust#define ROT4_4(x) ((x))
415*c217d954SCole Faust
416*c217d954SCole Faust#define ROT8_0(x) ((x))
417*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456)
418*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345)
419*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234)
420*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123)
421*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012)
422*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701)
423*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670)
424*c217d954SCole Faust#define ROT8_8(x) ((x))
425*c217d954SCole Faust
426*c217d954SCole Faust#define ROT16_0(x) ((x))
427*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
428*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
429*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
430*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
431*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
432*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
433*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
434*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
435*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
436*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
437*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
438*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
439*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
440*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
441*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
442*c217d954SCole Faust#define ROT16_16(x) ((x))
443*c217d954SCole Faust
444*c217d954SCole Faust
445*c217d954SCole Faust
446*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
447*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
448*c217d954SCole Faust
449*c217d954SCole Faust
450*c217d954SCole Faust
451*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0)
452*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1)
453*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2)
454*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
455*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
456*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
457*c217d954SCole Faust
458*c217d954SCole Faust
459*c217d954SCole Faust
460*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
461*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
462*c217d954SCole Faust
463*c217d954SCole Faust
464*c217d954SCole Faust#define VLOAD_STR(size) vload##size
465*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size)
466*c217d954SCole Faust
467*c217d954SCole Faust
468*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
469*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
470*c217d954SCole Faust
471*c217d954SCole Faust#define NO_LOAD(data, offs, ptr) \
472*c217d954SCole Faust    {                            \
473*c217d954SCole Faust    }
474*c217d954SCole Faust
475*c217d954SCole Faust
476*c217d954SCole Faust#define vload_partial_1_0 NO_LOAD
477*c217d954SCole Faust#define vload_partial_1_1 vload1
478*c217d954SCole Faust#define vload_partial_1_2 NO_LOAD
479*c217d954SCole Faust#define vload_partial_1_3 NO_LOAD
480*c217d954SCole Faust#define vload_partial_1_4 NO_LOAD
481*c217d954SCole Faust#define vload_partial_1_5 NO_LOAD
482*c217d954SCole Faust#define vload_partial_1_6 NO_LOAD
483*c217d954SCole Faust#define vload_partial_1_7 NO_LOAD
484*c217d954SCole Faust#define vload_partial_1_8 NO_LOAD
485*c217d954SCole Faust#define vload_partial_1_9 NO_LOAD
486*c217d954SCole Faust#define vload_partial_1_10 NO_LOAD
487*c217d954SCole Faust#define vload_partial_1_11 NO_LOAD
488*c217d954SCole Faust#define vload_partial_1_12 NO_LOAD
489*c217d954SCole Faust#define vload_partial_1_13 NO_LOAD
490*c217d954SCole Faust#define vload_partial_1_14 NO_LOAD
491*c217d954SCole Faust#define vload_partial_1_15 NO_LOAD
492*c217d954SCole Faust#define vload_partial_1_16 NO_LOAD
493*c217d954SCole Faust
494*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
495*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
496*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
497*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
498*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
499*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
500*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
501*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
502*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
503*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
504*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
505*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
506*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
507*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
508*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
509*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
510*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
511*c217d954SCole Faust
512*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
513*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
514*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
515*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
516*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
517*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
518*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
519*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
520*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
521*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
522*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
523*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
524*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
525*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
526*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
527*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
528*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
529*c217d954SCole Faust
530*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
531*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
532*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
533*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
534*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
535*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
536*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
537*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
538*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
539*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
540*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
541*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
542*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
543*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
544*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
545*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
546*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
547*c217d954SCole Faust
548*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
549*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
550*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
551*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
552*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
553*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
554*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
555*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
556*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
557*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
558*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
559*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
560*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
561*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
562*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
563*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
564*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
565*c217d954SCole Faust
566*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
567*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
568*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
569*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
570*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
571*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
572*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
573*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
574*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
575*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
576*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
577*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
578*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
579*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
580*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
581*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
582*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
583*c217d954SCole Faust
584*c217d954SCole Faust
585*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
586*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
587*c217d954SCole Faust
588*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
589*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
590*c217d954SCole Faust
591*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
592*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
593*c217d954SCole Faust
594*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
595*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
596*c217d954SCole Faust
597*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
598*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
599*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
600*c217d954SCole Faust
601*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
602*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
603*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
604*c217d954SCole Faust
605*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
606*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
607*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
608*c217d954SCole Faust
609*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
610*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
611*c217d954SCole Faust
612*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
613*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
614*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
615*c217d954SCole Faust
616*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
617*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
618*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
619*c217d954SCole Faust
620*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
621*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
622*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
623*c217d954SCole Faust
624*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
625*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
626*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
627*c217d954SCole Faust
628*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
629*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
630*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
631*c217d954SCole Faust
632*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
633*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
634*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
635*c217d954SCole Faust
636*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
637*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
638*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
639*c217d954SCole Faust
640*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
641*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
642*c217d954SCole Faust
643*c217d954SCole Faust
644*c217d954SCole Faust
645*c217d954SCole Faust#define PIXEL_UNIT4 1
646*c217d954SCole Faust#define PIXEL_UNIT8 2
647*c217d954SCole Faust#define PIXEL_UNIT16 4
648*c217d954SCole Faust
649*c217d954SCole Faust
650*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
651*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
652*c217d954SCole Faust
653*c217d954SCole Faust
654*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
655*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
656*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
657*c217d954SCole Faust
658*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
659*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
660*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
661*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
662*c217d954SCole Faust#endif
663*c217d954SCole Faust
664*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
665*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
666*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
667*c217d954SCole Faust
668*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
669*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
670*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
671*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
672*c217d954SCole Faust#endif
673*c217d954SCole Faust
674*c217d954SCole Faust
675*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
676*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
677*c217d954SCole Faust
678*c217d954SCole Faust
679*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
680*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
681*c217d954SCole Faust
682*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
683*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
684*c217d954SCole Faust
685*c217d954SCole Faust#define float1 float
686*c217d954SCole Faust#define half1 half
687*c217d954SCole Faust#define char1 char
688*c217d954SCole Faust#define uchar1 uchar
689*c217d954SCole Faust#define short1 short
690*c217d954SCole Faust#define ushort1 ushort
691*c217d954SCole Faust#define int1 int
692*c217d954SCole Faust#define uint1 uint
693*c217d954SCole Faust#define long1 long
694*c217d954SCole Faust#define ulong1 ulong
695*c217d954SCole Faust#define double1 double
696*c217d954SCole Faust
697*c217d954SCole Faust#define vload1(OFFSET, PTR) *(OFFSET + PTR)
698*c217d954SCole Faust#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
699*c217d954SCole Faust
700*c217d954SCole Faust
701*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
702*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
703*c217d954SCole Faust
704*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
705*c217d954SCole Faust    {                             \
706*c217d954SCole Faust    }
707*c217d954SCole Faust
708*c217d954SCole Faust
709*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
710*c217d954SCole Faust#define vstore_partial_1_1 vstore1
711*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
712*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
713*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
714*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
715*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
716*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
717*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
718*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
719*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
720*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
721*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
722*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
723*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
724*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
725*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
726*c217d954SCole Faust
727*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
728*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
729*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
730*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
731*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
732*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
733*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
734*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
735*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
736*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
737*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
738*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
739*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
740*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
741*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
742*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
743*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
744*c217d954SCole Faust
745*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
746*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
747*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
748*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
749*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
750*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
751*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
752*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
753*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
754*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
755*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
756*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
757*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
758*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
759*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
760*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
761*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
762*c217d954SCole Faust
763*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
764*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
765*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
766*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
767*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
768*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
769*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
770*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
771*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
772*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
773*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
774*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
775*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
776*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
777*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
778*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
779*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
780*c217d954SCole Faust
781*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
782*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
783*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
784*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
785*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
786*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
787*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
788*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
789*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
790*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
791*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
792*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
793*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
794*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
795*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
796*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
797*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
798*c217d954SCole Faust
799*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
800*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
801*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
802*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
803*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
804*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
805*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
806*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
807*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
808*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
809*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
810*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
811*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
812*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
813*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
814*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
815*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
816*c217d954SCole Faust
817*c217d954SCole Faust
818*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
819*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
820*c217d954SCole Faust
821*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
822*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
823*c217d954SCole Faust
824*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
825*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
826*c217d954SCole Faust
827*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
828*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
829*c217d954SCole Faust
830*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
831*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
832*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
833*c217d954SCole Faust
834*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
835*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
836*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
837*c217d954SCole Faust
838*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
839*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
840*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
841*c217d954SCole Faust
842*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
843*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
844*c217d954SCole Faust
845*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
846*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
847*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
848*c217d954SCole Faust
849*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
850*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
851*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
852*c217d954SCole Faust
853*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
854*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
855*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
856*c217d954SCole Faust
857*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
858*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
859*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
860*c217d954SCole Faust
861*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
862*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
863*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
864*c217d954SCole Faust
865*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
866*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
867*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
868*c217d954SCole Faust
869*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
870*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
871*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
872*c217d954SCole Faust
873*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
874*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
875*c217d954SCole Faust
876*c217d954SCole Faust
877*c217d954SCole Faust
878*c217d954SCole Faust
879*c217d954SCole Faust
880*c217d954SCole Faust#define convert_float_sat convert_float
881*c217d954SCole Faust#define convert_float1_sat convert_float
882*c217d954SCole Faust#define convert_float2_sat convert_float2
883*c217d954SCole Faust#define convert_float3_sat convert_float3
884*c217d954SCole Faust#define convert_float4_sat convert_float4
885*c217d954SCole Faust#define convert_float8_sat convert_float8
886*c217d954SCole Faust#define convert_float16_sat convert_float16
887*c217d954SCole Faust#define convert_half_sat convert_float
888*c217d954SCole Faust#define convert_half1_sat convert_half
889*c217d954SCole Faust#define convert_half2_sat convert_half2
890*c217d954SCole Faust#define convert_half3_sat convert_half3
891*c217d954SCole Faust#define convert_half4_sat convert_half4
892*c217d954SCole Faust#define convert_half8_sat convert_half8
893*c217d954SCole Faust#define convert_half16_sat convert_half16
894*c217d954SCole Faust
895*c217d954SCole Faust#define convert_float1 convert_float
896*c217d954SCole Faust#define convert_half1 convert_half
897*c217d954SCole Faust#define convert_char1 convert_char
898*c217d954SCole Faust#define convert_uchar1 convert_uchar
899*c217d954SCole Faust#define convert_short1 convert_short
900*c217d954SCole Faust#define convert_ushort1 convert_ushort
901*c217d954SCole Faust#define convert_int1 convert_int
902*c217d954SCole Faust#define convert_uint1 convert_uint
903*c217d954SCole Faust#define convert_long1 convert_long
904*c217d954SCole Faust#define convert_ulong1 convert_ulong
905*c217d954SCole Faust#define convert_double1 convert_double
906*c217d954SCole Faust
907*c217d954SCole Faust#define convert_char1_sat convert_char_sat
908*c217d954SCole Faust#define convert_uchar1_sat convert_uchar_sat
909*c217d954SCole Faust#define convert_uchar2_sat convert_uchar2_sat
910*c217d954SCole Faust#define convert_uchar3_sat convert_uchar3_sat
911*c217d954SCole Faust#define convert_uchar4_sat convert_uchar4_sat
912*c217d954SCole Faust#define convert_uchar8_sat convert_uchar8_sat
913*c217d954SCole Faust#define convert_uchar16_sat convert_uchar16_sat
914*c217d954SCole Faust#define convert_short1_sat convert_short_sat
915*c217d954SCole Faust#define convert_ushort1_sat convert_ushort_sat
916*c217d954SCole Faust#define convert_int1_sat convert_int_sat
917*c217d954SCole Faust#define convert_uint1_sat convert_uint_sat
918*c217d954SCole Faust#define convert_long1_sat convert_long_sat
919*c217d954SCole Faust#define convert_ulong1_sat convert_ulong_sat
920*c217d954SCole Faust#define convert_double1_sat convert_double_sat
921*c217d954SCole Faust
922*c217d954SCole Faust#define VEC_DATA_TYPE_STR(type, size) type##size
923*c217d954SCole Faust#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
924*c217d954SCole Faust
925*c217d954SCole Faust#define CONVERT_STR(x, type) (convert_##type((x)))
926*c217d954SCole Faust#define CONVERT(x, type) CONVERT_STR(x, type)
927*c217d954SCole Faust
928*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
929*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
930*c217d954SCole Faust
931*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
932*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
933*c217d954SCole Faust
934*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size
935*c217d954SCole Faust#define select_vec_dt_char(size) char##size
936*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size
937*c217d954SCole Faust#define select_vec_dt_short(size) short##size
938*c217d954SCole Faust#define select_vec_dt_half(size) short##size
939*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size
940*c217d954SCole Faust#define select_vec_dt_int(size) int##size
941*c217d954SCole Faust#define select_vec_dt_float(size) int##size
942*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size
943*c217d954SCole Faust#define select_vec_dt_long(size) long##size
944*c217d954SCole Faust
945*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
946*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
947*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
948*c217d954SCole Faust
949*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size
950*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size
951*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size
952*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size
953*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size
954*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size
955*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size
956*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size
957*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size
958*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size
959*c217d954SCole Faust
960*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
961*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
962*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
963*c217d954SCole Faust
964*c217d954SCole Faust#define sum_reduce_1(x) (x)
965*c217d954SCole Faust#define sum_reduce_2(x) ((x).s0) + ((x).s1)
966*c217d954SCole Faust#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
967*c217d954SCole Faust#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
968*c217d954SCole Faust#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
969*c217d954SCole Faust#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
970*c217d954SCole Faust
971*c217d954SCole Faust#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
972*c217d954SCole Faust#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
973*c217d954SCole Faust
974*c217d954SCole Faust#define prod_reduce_1(x) (x)
975*c217d954SCole Faust#define prod_reduce_2(x) ((x).s0) * ((x).s1)
976*c217d954SCole Faust#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
977*c217d954SCole Faust#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
978*c217d954SCole Faust#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
979*c217d954SCole Faust#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
980*c217d954SCole Faust
981*c217d954SCole Faust#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
982*c217d954SCole Faust#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
983*c217d954SCole Faust
984*c217d954SCole Faust#define max_reduce_1(x) (x)
985*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1))
986*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
987*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
988*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
989*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
990*c217d954SCole Faust
991*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
992*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
993*c217d954SCole Faust
994*c217d954SCole Faust#define VECTOR_DECLARATION(name)     \
995*c217d954SCole Faust    __global uchar *name##_ptr,      \
996*c217d954SCole Faust    uint        name##_stride_x, \
997*c217d954SCole Faust    uint        name##_step_x,   \
998*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
999*c217d954SCole Faust
1000*c217d954SCole Faust#define IMAGE_DECLARATION(name)      \
1001*c217d954SCole Faust    __global uchar *name##_ptr,      \
1002*c217d954SCole Faust    uint        name##_stride_x, \
1003*c217d954SCole Faust    uint        name##_step_x,   \
1004*c217d954SCole Faust    uint        name##_stride_y, \
1005*c217d954SCole Faust    uint        name##_step_y,   \
1006*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
1007*c217d954SCole Faust
1008*c217d954SCole Faust#define TENSOR3D_DECLARATION(name)   \
1009*c217d954SCole Faust    __global uchar *name##_ptr,      \
1010*c217d954SCole Faust    uint        name##_stride_x, \
1011*c217d954SCole Faust    uint        name##_step_x,   \
1012*c217d954SCole Faust    uint        name##_stride_y, \
1013*c217d954SCole Faust    uint        name##_step_y,   \
1014*c217d954SCole Faust    uint        name##_stride_z, \
1015*c217d954SCole Faust    uint        name##_step_z,   \
1016*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
1017*c217d954SCole Faust
1018*c217d954SCole Faust#define TENSOR4D_DECLARATION(name)   \
1019*c217d954SCole Faust    __global uchar *name##_ptr,      \
1020*c217d954SCole Faust    uint        name##_stride_x, \
1021*c217d954SCole Faust    uint        name##_step_x,   \
1022*c217d954SCole Faust    uint        name##_stride_y, \
1023*c217d954SCole Faust    uint        name##_step_y,   \
1024*c217d954SCole Faust    uint        name##_stride_z, \
1025*c217d954SCole Faust    uint        name##_step_z,   \
1026*c217d954SCole Faust    uint        name##_stride_w, \
1027*c217d954SCole Faust    uint        name##_step_w,   \
1028*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
1029*c217d954SCole Faust
1030*c217d954SCole Faust#define TENSOR5D_DECLARATION(name)   \
1031*c217d954SCole Faust    __global uchar *name##_ptr,      \
1032*c217d954SCole Faust    uint        name##_stride_x, \
1033*c217d954SCole Faust    uint        name##_step_x,   \
1034*c217d954SCole Faust    uint        name##_stride_y, \
1035*c217d954SCole Faust    uint        name##_step_y,   \
1036*c217d954SCole Faust    uint        name##_stride_z, \
1037*c217d954SCole Faust    uint        name##_step_z,   \
1038*c217d954SCole Faust    uint        name##_stride_w, \
1039*c217d954SCole Faust    uint        name##_step_w,   \
1040*c217d954SCole Faust    uint        name##_stride_v, \
1041*c217d954SCole Faust    uint        name##_step_v,   \
1042*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
1043*c217d954SCole Faust
1044*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \
1045*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1046*c217d954SCole Faust
1047*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1048*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1049*c217d954SCole Faust
1050*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \
1051*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1052*c217d954SCole Faust
1053*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1054*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1055*c217d954SCole Faust
1056*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1057*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1058*c217d954SCole Faust
1059*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1060*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1061*c217d954SCole Faust
1062*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1063*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1064*c217d954SCole Faust
1065*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1066*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1067*c217d954SCole Faust                                 name##_stride_z, name##_step_z)
1068*c217d954SCole Faust
1069*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1070*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1071*c217d954SCole Faust
1072*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1073*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1074*c217d954SCole Faust                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1075*c217d954SCole Faust
1076*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1077*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1078*c217d954SCole Faust
1079*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1080*c217d954SCole Faust    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1081*c217d954SCole Faust                           name##_stride_z, name##_step_z)
1082*c217d954SCole Faust
1083*c217d954SCole Faust
1084*c217d954SCole Fausttypedef struct Vector
1085*c217d954SCole Faust{
1086*c217d954SCole Faust    __global uchar *ptr;
1087*c217d954SCole Faust    int             offset_first_element_in_bytes;
1088*c217d954SCole Faust    int             stride_x;
1089*c217d954SCole Faust} Vector;
1090*c217d954SCole Faust
1091*c217d954SCole Faust
1092*c217d954SCole Fausttypedef struct Image
1093*c217d954SCole Faust{
1094*c217d954SCole Faust    __global uchar *ptr;
1095*c217d954SCole Faust    int             offset_first_element_in_bytes;
1096*c217d954SCole Faust    int             stride_x;
1097*c217d954SCole Faust    int             stride_y;
1098*c217d954SCole Faust} Image;
1099*c217d954SCole Faust
1100*c217d954SCole Faust
1101*c217d954SCole Fausttypedef struct Tensor3D
1102*c217d954SCole Faust{
1103*c217d954SCole Faust    __global uchar *ptr;
1104*c217d954SCole Faust    int             offset_first_element_in_bytes;
1105*c217d954SCole Faust    int             stride_x;
1106*c217d954SCole Faust    int             stride_y;
1107*c217d954SCole Faust    int             stride_z;
1108*c217d954SCole Faust} Tensor3D;
1109*c217d954SCole Faust
1110*c217d954SCole Faust
1111*c217d954SCole Fausttypedef struct Tensor4D
1112*c217d954SCole Faust{
1113*c217d954SCole Faust    __global uchar *ptr;
1114*c217d954SCole Faust    int             offset_first_element_in_bytes;
1115*c217d954SCole Faust    int             stride_x;
1116*c217d954SCole Faust    int             stride_y;
1117*c217d954SCole Faust    int             stride_z;
1118*c217d954SCole Faust    int             stride_w;
1119*c217d954SCole Faust} Tensor4D;
1120*c217d954SCole Faust
1121*c217d954SCole Faust
1122*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1123*c217d954SCole Faust{
1124*c217d954SCole Faust    Vector vector =
1125*c217d954SCole Faust    {
1126*c217d954SCole Faust        .ptr                           = ptr,
1127*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1128*c217d954SCole Faust        .stride_x                      = stride_x,
1129*c217d954SCole Faust    };
1130*c217d954SCole Faust    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1131*c217d954SCole Faust    return vector;
1132*c217d954SCole Faust}
1133*c217d954SCole Faust
1134*c217d954SCole Faust
1135*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1136*c217d954SCole Faust{
1137*c217d954SCole Faust    Image img =
1138*c217d954SCole Faust    {
1139*c217d954SCole Faust        .ptr                           = ptr,
1140*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1141*c217d954SCole Faust        .stride_x                      = stride_x,
1142*c217d954SCole Faust        .stride_y                      = stride_y
1143*c217d954SCole Faust    };
1144*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1145*c217d954SCole Faust    return img;
1146*c217d954SCole Faust}
1147*c217d954SCole Faust
1148*c217d954SCole Faust
1149*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1150*c217d954SCole Faust{
1151*c217d954SCole Faust    Image img =
1152*c217d954SCole Faust    {
1153*c217d954SCole Faust        .ptr                           = ptr,
1154*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1155*c217d954SCole Faust        .stride_x                      = stride_x,
1156*c217d954SCole Faust        .stride_y                      = stride_y
1157*c217d954SCole Faust    };
1158*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1159*c217d954SCole Faust    return img;
1160*c217d954SCole Faust}
1161*c217d954SCole Faust
1162*c217d954SCole Faust
1163*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1164*c217d954SCole Faust{
1165*c217d954SCole Faust    Tensor3D tensor =
1166*c217d954SCole Faust    {
1167*c217d954SCole Faust        .ptr                           = ptr,
1168*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1169*c217d954SCole Faust        .stride_x                      = stride_x,
1170*c217d954SCole Faust        .stride_y                      = stride_y,
1171*c217d954SCole Faust        .stride_z                      = stride_z
1172*c217d954SCole Faust    };
1173*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1174*c217d954SCole Faust    return tensor;
1175*c217d954SCole Faust}
1176*c217d954SCole Faust
1177*c217d954SCole Faust
1178*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1179*c217d954SCole Faust{
1180*c217d954SCole Faust    Tensor3D tensor =
1181*c217d954SCole Faust    {
1182*c217d954SCole Faust        .ptr                           = ptr,
1183*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1184*c217d954SCole Faust        .stride_x                      = stride_x,
1185*c217d954SCole Faust        .stride_y                      = stride_y,
1186*c217d954SCole Faust        .stride_z                      = stride_z
1187*c217d954SCole Faust    };
1188*c217d954SCole Faust    return tensor;
1189*c217d954SCole Faust}
1190*c217d954SCole Faust
1191*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1192*c217d954SCole Faust                                             uint step_w,
1193*c217d954SCole Faust                                             uint mod_size)
1194*c217d954SCole Faust{
1195*c217d954SCole Faust    Tensor4D tensor =
1196*c217d954SCole Faust    {
1197*c217d954SCole Faust        .ptr                           = ptr,
1198*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1199*c217d954SCole Faust        .stride_x                      = stride_x,
1200*c217d954SCole Faust        .stride_y                      = stride_y,
1201*c217d954SCole Faust        .stride_z                      = stride_z,
1202*c217d954SCole Faust        .stride_w                      = stride_w
1203*c217d954SCole Faust    };
1204*c217d954SCole Faust
1205*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1206*c217d954SCole Faust    return tensor;
1207*c217d954SCole Faust}
1208*c217d954SCole Faust
1209*c217d954SCole Faust
1210*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
1211*c217d954SCole Faust{
1212*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
1213*c217d954SCole Faust}
1214*c217d954SCole Faust
1215*c217d954SCole Faust
1216*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
1217*c217d954SCole Faust{
1218*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
1219*c217d954SCole Faust}
1220*c217d954SCole Faust
1221*c217d954SCole Faust
1222*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1223*c217d954SCole Faust{
1224*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1225*c217d954SCole Faust}
1226*c217d954SCole Faust
1227*c217d954SCole Faust
1228*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1229*c217d954SCole Faust{
1230*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1231*c217d954SCole Faust}
1232*c217d954SCole Faust
1233*c217d954SCole Faust
1234*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1235*c217d954SCole Faust{
1236*c217d954SCole Faust    uint num_elements = width * height;
1237*c217d954SCole Faust
1238*c217d954SCole Faust    const uint z = index / num_elements;
1239*c217d954SCole Faust
1240*c217d954SCole Faust    index %= num_elements;
1241*c217d954SCole Faust
1242*c217d954SCole Faust    const uint y = index / width;
1243*c217d954SCole Faust
1244*c217d954SCole Faust    index %= width;
1245*c217d954SCole Faust
1246*c217d954SCole Faust    const uint x = index;
1247*c217d954SCole Faust
1248*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1249*c217d954SCole Faust}
1250*c217d954SCole Faust
1251*c217d954SCole Faust#endif
1252*c217d954SCole Faust
1253*c217d954SCole Faust#if GPU_ARCH == GPU_ARCH_BIFROST
1254*c217d954SCole Faust#define MLA(a, b, c) (fma(c, b, a))
1255*c217d954SCole Faust#else
1256*c217d954SCole Faust#define MLA(a, b, c) ((b) * (c) + (a))
1257*c217d954SCole Faust#endif
1258*c217d954SCole Faust
1259*c217d954SCole Faust
1260*c217d954SCole Faust#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
1261*c217d954SCole Faust
1262*c217d954SCole Faust
1263*c217d954SCole Faust#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
1264*c217d954SCole Faust
1265*c217d954SCole Faust
1266*c217d954SCole Faust#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
1267*c217d954SCole Faust
1268*c217d954SCole Faust
1269*c217d954SCole Faust#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
1270*c217d954SCole Faust
1271*c217d954SCole Faust
1272*c217d954SCole Faust#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
1273*c217d954SCole Faust
1274*c217d954SCole Faust
1275*c217d954SCole Faust#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
1276*c217d954SCole Faust
1277*c217d954SCole Faust
1278*c217d954SCole Faust#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
1279*c217d954SCole Faust
1280*c217d954SCole Faust
1281*c217d954SCole Faust#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
1282*c217d954SCole Faust
1283*c217d954SCole Faust
1284*c217d954SCole Faust#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
1285*c217d954SCole Faust
1286*c217d954SCole Faust
1287*c217d954SCole Faust#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
1288*c217d954SCole Faust
1289*c217d954SCole Faust
1290*c217d954SCole Faust#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)
1291*c217d954SCole Faust
1292*c217d954SCole Faust
1293*c217d954SCole Faust#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))
1294*c217d954SCole Faust
1295*c217d954SCole Faust
1296*c217d954SCole Faust#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
1297*c217d954SCole Faust
1298*c217d954SCole Faust
1299*c217d954SCole Faust#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
1300*c217d954SCole Faust
1301*c217d954SCole Faust
1302*c217d954SCole Faust#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
1303*c217d954SCole Faust
1304*c217d954SCole Faust#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1305*c217d954SCole Faust
1306*c217d954SCole Faust#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1307*c217d954SCole Faust
1308*c217d954SCole Faust
1309*c217d954SCole Faust__kernel void activation_layer(
1310*c217d954SCole Faust    TENSOR3D_DECLARATION(input)
1311*c217d954SCole Faust#ifndef IN_PLACE
1312*c217d954SCole Faust    ,
1313*c217d954SCole Faust    TENSOR3D_DECLARATION(output)
1314*c217d954SCole Faust#endif
1315*c217d954SCole Faust)
1316*c217d954SCole Faust{
1317*c217d954SCole Faust    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
1318*c217d954SCole Faust
1319*c217d954SCole Faust
1320*c217d954SCole Faust    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
1321*c217d954SCole Faust#ifdef IN_PLACE
1322*c217d954SCole Faust    __global uchar *output_addr = input_addr;
1323*c217d954SCole Faust#else
1324*c217d954SCole Faust    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
1325*c217d954SCole Faust#endif
1326*c217d954SCole Faust
1327*c217d954SCole Faust
1328*c217d954SCole Faust    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
1329*c217d954SCole Faust
1330*c217d954SCole Faust
1331*c217d954SCole Faust    data0 = ACTIVATION(ACT, DATA_TYPE, VEC_SIZE, data0, A_VAL, B_VAL);
1332*c217d954SCole Faust
1333*c217d954SCole Faust
1334*c217d954SCole Faust    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1335*c217d954SCole Faust}
1336*c217d954SCole Faust
1337*c217d954SCole Faust#endif  )"