xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/common/gemm_utils.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1*c217d954SCole FaustR"(
2*c217d954SCole Faust
3*c217d954SCole Faust
4*c217d954SCole Faust
5*c217d954SCole Faust
6*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
7*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
8*c217d954SCole Faust
9*c217d954SCole Faust
10*c217d954SCole Faust
11*c217d954SCole Faust
/* STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Expands to <n> store statements, one per row 0..n-1. The statement for
 * row i is
 *     VSTORE(N0)(BASENAME##i, 0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 * The pasted suffix is a single hexadecimal digit: 0-9 for the first ten rows,
 * A-F for rows 10-15. STORE_ROW_<n> first expands STORE_ROW_<n-1> and then
 * appends the store for row n-1. Each generated statement already ends in ';'.
 *
 *   N0        - passed to VSTORE
 *   DATA_TYPE - element type used in the destination pointer cast
 *   BASENAME  - prefix of the per-row source names (BASENAME##0, BASENAME##1, ...)
 *   PTR       - base address; i * STRIDE_Y and Z##i are added to it before the cast
 *   STRIDE_Y  - multiplier applied to the row index
 *   Z         - prefix of the per-row additional offsets (Z##0, Z##1, ...)
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
90*c217d954SCole Faust
91*c217d954SCole Faust
92*c217d954SCole Faust
/* CONVERT_STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Same layout as STORE_ROW_<n>, except that each row value is first passed
 * through CONVERT_SAT(value, VEC_DATA_TYPE(DATA_TYPE, N0)). The statement for
 * row i is
 *     VSTORE(N0)(CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
 *                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 * The pasted suffix is one hexadecimal digit (0-9, then A-F for rows 10-15).
 * CONVERT_STORE_ROW_<n> expands CONVERT_STORE_ROW_<n-1> and appends row n-1.
 * Each generated statement already ends in ';'.
 *
 *   N0        - passed to VSTORE and VEC_DATA_TYPE
 *   DATA_TYPE - element type of the converted value and of the destination pointer
 *   BASENAME  - prefix of the per-row source names
 *   PTR       - base address; i * STRIDE_Y and Z##i are added to it before the cast
 *   STRIDE_Y  - multiplier applied to the row index
 *   Z         - prefix of the per-row additional offsets
 *
 * Note: CONVERT_STORE_ROW_10 must name its second parameter DATA_TYPE like every
 * other variant. With any other parameter name the body's DATA_TYPE token is not
 * substituted, so the caller's type is dropped and whatever DATA_TYPE means at
 * the point of use is applied to rows 0..9 instead.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
171*c217d954SCole Faust
172*c217d954SCole Faust
173*c217d954SCole Faust
174*c217d954SCole Faust
/* STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Selects STORE_ROW_<M0> by token pasting and forwards the remaining
 * arguments unchanged. The public macro goes through the _STR helper so that
 * M0 is macro-expanded before it is pasted; calling STORE_BLOCK_STR directly
 * pastes the M0 token exactly as written.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Same two-step dispatch as STORE_BLOCK, but the target is
 * CONVERT_STORE_ROW_<M0>.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
182*c217d954SCole Faust
183*c217d954SCole Faust
184*c217d954SCole Faust
/* STORE_ROW_PARTIAL_<n>(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Expands to <n> store statements, one per row 0..n-1, each issued through
 * VSTORE_PARTIAL(N0, STORE_N0) instead of VSTORE(N0). The statement for row i is
 *     VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##i, 0,
 *                                  (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 * The pasted suffix is one hexadecimal digit (0-9, then A-F for rows 10-15).
 * STORE_ROW_PARTIAL_<n> expands STORE_ROW_PARTIAL_<n-1> and appends row n-1.
 * Each generated statement already ends in ';'.
 *
 *   N0, STORE_N0 - both forwarded to VSTORE_PARTIAL
 *   DATA_TYPE    - element type used in the destination pointer cast
 *   BASENAME     - prefix of the per-row source names
 *   PTR          - base address; i * STRIDE_Y and Z##i are added before the cast
 *   STRIDE_Y     - multiplier applied to the row index
 *   Z            - prefix of the per-row additional offsets
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
263*c217d954SCole Faust
264*c217d954SCole Faust
265*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Selects STORE_ROW_PARTIAL_<STORE_M0> by token pasting and invokes it with
 * (N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z). Note the argument
 * order swap: STORE_N0 precedes N0 here but follows it in the row macro.
 * The _STR helper exists so that STORE_M0 is macro-expanded before pasting.
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* Run-time choice between four block shapes, driven by two conditions:
 *   neither condition : M0 rows,               N0 elements per row
 *   only Y            : PARTIAL_STORE_M0 rows, N0 elements per row
 *   only X            : M0 rows,               PARTIAL_STORE_N0 elements per row
 *   both              : PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements per row
 * The expansion is a bare if / else-if chain: it is not wrapped in
 * do { } while(0) and needs no trailing ';'.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!((PARTIAL_COND_X) || (PARTIAL_COND_Y))) \
    { STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); } \
    else \
    { STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); }

/* Two-way choice on PARTIAL_COND_X only: when it holds, each of the M0 rows
 * stores PARTIAL_STORE_N0 elements; otherwise each row stores N0 elements.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X) \
    { STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); } \
    else \
    { STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); }

/* Two-way choice on PARTIAL_COND_Y only: when it holds, PARTIAL_STORE_M0 rows
 * are stored; otherwise M0 rows. Every row stores N0 elements.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y) \
    { STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); } \
    else \
    { STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); }
306*c217d954SCole Faust
307*c217d954SCole Faust
/* STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                            PARTIAL_STORE_M0, PARTIAL_STORE_N0,
 *                            PARTIAL_COND_Y, PARTIAL_COND_X)
 *
 * Defined only when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 exist as
 * macros at this point. Their preprocessing-time values pick the cheapest
 * store variant:
 *   M0 == 0, N0 == 0 : STORE_BLOCK                     (conditions unused)
 *   M0  > 0, N0 == 0 : STORE_BLOCK_PARTIAL_IN_Y        (only PARTIAL_COND_Y used)
 *   M0 == 0, N0  > 0 : STORE_BLOCK_PARTIAL_IN_X        (only PARTIAL_COND_X used)
 *   anything else    : STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * The macro parameters PARTIAL_STORE_M0 / PARTIAL_STORE_N0 shadow the
 * like-named build macros inside the replacement text.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else /* remaining combinations of PARTIAL_STORE_M0 / PARTIAL_STORE_N0 */

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif /* value-based selection */

#endif /* defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) */
334*c217d954SCole Faust
335*c217d954SCole Faust
/* COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * Yields, as a uint, the first row index for block number y when blocks are
 * M0 rows tall.
 *
 * When the build defines PARTIAL_STORE_M0, the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * i.e. the plain product moved back by (M0 - PARTIAL_STORE_M0) % M0 rows and
 * clamped so it never drops below row 0. Without that build macro the third
 * argument is ignored and the result is simply y * M0.
 *
 * Every argument is parenthesized in the replacement text so that expression
 * arguments (for example "a + b" as y) keep their intended grouping.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
344*c217d954SCole Faust
345*c217d954SCole Faust
346*c217d954SCole Faust
/* STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
 *
 * Single-row wrapper around STORE_BLOCK_PARTIAL_IN_X: M0 is fixed to 1, and
 * both STRIDE_Y and Z are passed as the literal 0. When cond holds, leftover
 * elements are stored, otherwise vec_size elements.
 * Because the row macros paste the row digit onto Z, the row-0 offset token
 * becomes "00" and the source name becomes basename##0.
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
349*c217d954SCole Faust
350*c217d954SCole Faust
/* Optional OpenCL extensions. Each pragma is emitted only when both the
 * ARM_COMPUTE_* switch and the matching extension macro are defined.
 */
#ifdef ARM_COMPUTE_OPENCL_FP16_ENABLED
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif /* cl_khr_fp16 */
#endif /* ARM_COMPUTE_OPENCL_FP16_ENABLED */

#ifdef ARM_COMPUTE_OPENCL_DOT8_ENABLED
#ifdef cl_arm_integer_dot_product_int8
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif /* cl_arm_integer_dot_product_int8 */
#endif /* ARM_COMPUTE_OPENCL_DOT8_ENABLED */

#ifdef ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED
#ifdef cl_arm_integer_dot_product_accumulate_int8
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif /* cl_arm_integer_dot_product_accumulate_int8 */
#endif /* ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED */

#ifdef ARM_COMPUTE_DEBUG_ENABLED
#ifdef cl_arm_printf
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif /* cl_arm_printf */
#endif /* ARM_COMPUTE_DEBUG_ENABLED */

/* Numeric tags for GPU architecture families. No user of these constants is
 * visible in this part of the file; presumably they are compared against a
 * build-time architecture selector.
 */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300
370*c217d954SCole Faust
371*c217d954SCole Faust
/* CONCAT(lhs, rhs): paste the two tokens together. The arguments are pasted
 * as written; they are not macro-expanded first.
 */
#define CONCAT(lhs, rhs) lhs##rhs

/* EXPAND(tok): yield the argument after normal macro expansion. */
#define EXPAND(tok) tok

/* CLAMP(v, min_val, max_val): limit v to the closed range [min_val, max_val]
 * by taking max against the lower bound, then min against the upper bound.
 */
#define CLAMP(v, min_val, max_val) min(max(v, min_val), max_val)
379*c217d954SCole Faust
380*c217d954SCole Faust
/* REV<size>(v): component selection that lists the lanes of a <size>-wide
 * value in reverse order. REV1 is the identity.
 */
#define REV1(v)  ((v))
#define REV2(v)  ((v).s10)
#define REV3(v)  ((v).s210)
#define REV4(v)  ((v).s3210)
#define REV8(v)  ((v).s76543210)
#define REV16(v) ((v).sFEDCBA9876543210)

/* REVERSE(v, size): dispatch to REV<size>. The public macro goes through
 * REVERSE_STR so that size is macro-expanded before it is pasted.
 */
#define REVERSE_STR(v, size) REV##size((v))
#define REVERSE(v, size)     REVERSE_STR(v, size)
392*c217d954SCole Faust
393*c217d954SCole Faust
394*c217d954SCole Faust
/* ROT<s>_<n>(x): x (a vector of s components) rotated by n positions, written as
 * a swizzle. Component i of the result is component (i - n) mod s of x, so
 * elements move towards higher indices and wrap around. n == 0 and n == s both
 * return x unchanged. */
395*c217d954SCole Faust#define ROT1_0(x) ((x))
396*c217d954SCole Faust#define ROT1_1(x) ((x))
397*c217d954SCole Faust
398*c217d954SCole Faust#define ROT2_0(x) ((x))
399*c217d954SCole Faust#define ROT2_1(x) ((x).s10)
400*c217d954SCole Faust#define ROT2_2(x) ((x))
401*c217d954SCole Faust
402*c217d954SCole Faust#define ROT3_0(x) ((x))
403*c217d954SCole Faust#define ROT3_1(x) ((x).s201)
404*c217d954SCole Faust#define ROT3_2(x) ((x).s120)
405*c217d954SCole Faust#define ROT3_3(x) ((x))
406*c217d954SCole Faust
407*c217d954SCole Faust#define ROT4_0(x) ((x))
408*c217d954SCole Faust#define ROT4_1(x) ((x).s3012)
409*c217d954SCole Faust#define ROT4_2(x) ((x).s2301)
410*c217d954SCole Faust#define ROT4_3(x) ((x).s1230)
411*c217d954SCole Faust#define ROT4_4(x) ((x))
412*c217d954SCole Faust
413*c217d954SCole Faust#define ROT8_0(x) ((x))
414*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456)
415*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345)
416*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234)
417*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123)
418*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012)
419*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701)
420*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670)
421*c217d954SCole Faust#define ROT8_8(x) ((x))
422*c217d954SCole Faust
423*c217d954SCole Faust#define ROT16_0(x) ((x))
424*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
425*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
426*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
427*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
428*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
429*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
430*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
431*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
432*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
433*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
434*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
435*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
436*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
437*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
438*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
439*c217d954SCole Faust#define ROT16_16(x) ((x))
440*c217d954SCole Faust
441*c217d954SCole Faust
442*c217d954SCole Faust
/* ROTATE(x, s, n): rotate the size-s vector x by n positions.
 * ROTATE macro-expands s and n, then ROTATE_STR pastes them into ROT<s>_<n>;
 * both must resolve to literal values for which a ROT macro exists above. */
443*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
444*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
445*c217d954SCole Faust
446*c217d954SCole Faust
447*c217d954SCole Faust
/* V_OFFS<s>(dt): a vector literal of type dt<s> holding the values 0, 1, ..., s-1.
 * The size-1 form names the type dt1, so it needs a dt1 alias to exist for the
 * chosen element type. */
448*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0)
449*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1)
450*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2)
451*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
452*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
453*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
454*c217d954SCole Faust
455*c217d954SCole Faust
456*c217d954SCole Faust
/* VEC_OFFS(dt, s): index vector 0..s-1 of element type dt.
 * VEC_OFFS expands its arguments; VEC_OFFS_STR then pastes s onto V_OFFS. */
457*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
458*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
459*c217d954SCole Faust
460*c217d954SCole Faust
/* VLOAD(size): the name vload<size>. `size` is macro-expanded before being pasted,
 * so it may itself be a macro. The result is meant to be followed by an argument
 * list. */
461*c217d954SCole Faust#define VLOAD_STR(size) vload##size
462*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size)
463*c217d954SCole Faust
464*c217d954SCole Faust
/* VLOAD_PARTIAL(size, load_size): the name vload_partial_<size>_<load_size>, i.e.
 * the dispatch-table entry for loading load_size elements into a size-wide
 * vector. Both arguments are macro-expanded before pasting. */
465*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
466*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
467*c217d954SCole Faust
/* Placeholder for size/load_size combinations that load nothing: expands to an
 * empty block and discards all three arguments. */
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

/* Dispatch table for a size-1 (scalar) destination:
 * vload_partial_1_<load_size>(DATA, OFFSET, PTR).
 *
 * Only load_size == 1 loads anything. That entry used to be a plain alias of
 * vload1, which takes (OFFSET, PTR) and returns the value, so invoking it with
 * the (DATA, OFFSET, PTR) argument list every other entry accepts could not be
 * preprocessed. It is now a three-argument macro that assigns the loaded
 * scalar to DATA. DATA is assigned directly (no .s0) because it is a scalar.
 * The trailing semicolon mirrors the vload_partial_<n> helpers. */
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1(DATA, OFFSET, PTR) DATA = vload1(OFFSET, PTR);
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD
490*c217d954SCole Faust
/* Dispatch tables vload_partial_<size>_<load_size> for vector sizes 2, 3, 4, 8, 16.
 * For 1 <= load_size <= size the entry forwards to vload_partial_<load_size>;
 * for load_size == 0 or load_size > size it is NO_LOAD (nothing is loaded). */

/* Size == 2 */
491*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
492*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
493*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
494*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
495*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
496*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
497*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
498*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
499*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
500*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
501*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
502*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
503*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
504*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
505*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
506*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
507*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
508*c217d954SCole Faust
/* Size == 3 */
509*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
510*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
511*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
512*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
513*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
514*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
515*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
516*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
517*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
518*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
519*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
520*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
521*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
522*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
523*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
524*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
525*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
526*c217d954SCole Faust
/* Size == 4 */
527*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
528*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
529*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
530*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
531*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
532*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
533*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
534*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
535*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
536*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
537*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
538*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
539*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
540*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
541*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
542*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
543*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
544*c217d954SCole Faust
/* Size == 8 */
545*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
546*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
547*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
548*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
549*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
550*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
551*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
552*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
553*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
554*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
555*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
556*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
557*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
558*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
559*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
560*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
561*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
562*c217d954SCole Faust
/* Size == 16 */
563*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
564*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
565*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
566*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
567*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
568*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
569*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
570*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
571*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
572*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
573*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
574*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
575*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
576*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
577*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
578*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
579*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
580*c217d954SCole Faust
581*c217d954SCole Faust
/* vload_partial_<n>(DATA, OFFSET, PTR): load n elements from PTR into the n lowest
 * components of the vector DATA; the remaining components are left untouched.
 * Sizes with a native vload (1, 2, 3, 4, 8, 16) are a single load. The others
 * are split into an 8- or 4-wide load plus a smaller partial load on the
 * remainder, with PTR advanced by the number of elements already loaded.
 * Each expansion ends in a semicolon, so these are statements, not expressions.
 *
 * NOTE(review): the composite forms pass OFFSET unchanged to both halves while
 * advancing PTR by elements. vload<n> scales its offset by n, whereas vload1
 * adds it unscaled, so the two halves look contiguous only for OFFSET == 0.
 * Confirm callers always pass 0. */
582*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
583*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
584*c217d954SCole Faust
585*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
586*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
587*c217d954SCole Faust
588*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
589*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
590*c217d954SCole Faust
591*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
592*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
593*c217d954SCole Faust
/* 5..7 elements: 4-wide load followed by 1, 2 or 3 more starting at PTR + 4. */
594*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
595*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
596*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
597*c217d954SCole Faust
598*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
599*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
600*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
601*c217d954SCole Faust
602*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
603*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
604*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
605*c217d954SCole Faust
606*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
607*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
608*c217d954SCole Faust
/* 9..15 elements: 8-wide load followed by a partial load starting at PTR + 8. */
609*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
610*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
611*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
612*c217d954SCole Faust
613*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
614*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
615*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
616*c217d954SCole Faust
617*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
618*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
619*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
620*c217d954SCole Faust
621*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
622*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
623*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
624*c217d954SCole Faust
/* 13..15 pass the whole upper half (s89ABCDEF) to the 5/6/7-element helpers,
 * which then address its low components through a nested swizzle. */
625*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
626*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
627*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
628*c217d954SCole Faust
629*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
630*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
631*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
632*c217d954SCole Faust
633*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
634*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
635*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
636*c217d954SCole Faust
637*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
638*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
639*c217d954SCole Faust
640*c217d954SCole Faust
641*c217d954SCole Faust
/* PIXEL_UNIT<vec_size>: vec_size / 4 for vector sizes 4, 8 and 16. This matches
 * the image helpers in this file, which move four components per pixel. */
642*c217d954SCole Faust#define PIXEL_UNIT4 1
643*c217d954SCole Faust#define PIXEL_UNIT8 2
644*c217d954SCole Faust#define PIXEL_UNIT16 4
645*c217d954SCole Faust
646*c217d954SCole Faust
/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): look up PIXEL_UNIT<vec_size>.
 * vec_size is macro-expanded first and must resolve to 4, 8 or 16. */
647*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
648*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
649*c217d954SCole Faust
650*c217d954SCole Faust
/* read_image2d_floatx<N>(img, x_coord, y_coord): read N horizontally adjacent
 * pixels (x_coord .. x_coord + N - 1, same y_coord) with read_imagef and
 * concatenate them into a float4 / float8 / float16.
 * The expansion already ends in a semicolon.
 * NOTE(review): x_coord and y_coord are substituted unparenthesized, so
 * arguments containing operators that bind looser than '+' would mis-group. */
651*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
652*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
653*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
654*c217d954SCole Faust
/* Half-precision counterparts built on read_imageh. Only defined when the build
 * enables FP16 and the compiler advertises cl_khr_fp16. */
655*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
656*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
657*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
658*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
659*c217d954SCole Faust#endif
660*c217d954SCole Faust
/* write_image2d_floatx<N>(img, x_coord, y_coord, values): write N horizontally
 * adjacent pixels with write_imagef, taking successive 4-component slices of
 * `values` (s0123, s4567, s89AB, sCDEF). The calls are chained with the comma
 * operator and the expansion already ends in a semicolon. */
661*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
662*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
663*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
664*c217d954SCole Faust
/* Half-precision counterparts built on write_imageh, under the same FP16 guard. */
665*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
666*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
667*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
668*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
669*c217d954SCole Faust#endif
670*c217d954SCole Faust
671*c217d954SCole Faust
/* READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatch to
 * read_image2d_<data_type>x<n0>. data_type and n0 are macro-expanded before
 * being pasted; a helper exists only for float/half and n0 in {1, 2, 4}. */
672*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
673*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
674*c217d954SCole Faust
675*c217d954SCole Faust
/* WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatch to
 * write_image2d_<data_type>x<n0>, with the same expansion rules as READ_IMAGE2D. */
676*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
677*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
678*c217d954SCole Faust
/* VSTORE(size): the name vstore<size>. `size` is macro-expanded before being
 * pasted. The result is meant to be followed by an argument list. */
679*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
680*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
681*c217d954SCole Faust
/* Size-1 "vector" type names mapped to the plain scalar types, so that names
 * formed by pasting a size of 1 onto a type (e.g. float1) are valid. */
682*c217d954SCole Faust#define float1 float
683*c217d954SCole Faust#define half1 half
684*c217d954SCole Faust#define char1 char
685*c217d954SCole Faust#define uchar1 uchar
686*c217d954SCole Faust#define short1 short
687*c217d954SCole Faust#define ushort1 ushort
688*c217d954SCole Faust#define int1 int
689*c217d954SCole Faust#define uint1 uint
690*c217d954SCole Faust#define long1 long
691*c217d954SCole Faust#define ulong1 ulong
692*c217d954SCole Faust#define double1 double
693*c217d954SCole Faust
/* Scalar stand-ins for the vload<n>/vstore<n> family, so that VLOAD(1) and
 * VSTORE(1) resolve to something usable.
 *
 *   vload1(OFFSET, PTR)        - the element at PTR[OFFSET]; still an lvalue.
 *   vstore1(DATA, OFFSET, PTR) - assigns DATA to PTR[OFFSET].
 *
 * Every argument and the whole expansion are parenthesized so that
 * low-precedence arguments (e.g. `cond ? 1 : 0`, `i & 1`) and postfix operators
 * applied to the result (e.g. `vload1(0, p).x`) group as intended. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
696*c217d954SCole Faust
697*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size): the name vstore_partial_<size>_<store_size>,
 * i.e. the dispatch-table entry for storing store_size elements of a size-wide
 * vector. Both arguments are macro-expanded before pasting. */
698*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
699*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
700*c217d954SCole Faust
/* Placeholder for combinations that store nothing: expands to an empty block and
 * discards all three arguments. */
701*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
702*c217d954SCole Faust    {                             \
703*c217d954SCole Faust    }
704*c217d954SCole Faust
705*c217d954SCole Faust
/* Dispatch tables vstore_partial_<size>_<store_size>.
 * For 1 <= store_size <= size the entry forwards to a store helper; for
 * store_size == 0 or store_size > size it is NO_STORE (nothing is written). */

/* Size == 1 (scalar): the only real entry forwards straight to vstore1. */
706*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
707*c217d954SCole Faust#define vstore_partial_1_1 vstore1
708*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
709*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
710*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
711*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
712*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
713*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
714*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
715*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
716*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
717*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
718*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
719*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
720*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
721*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
722*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
723*c217d954SCole Faust
/* Size == 2 */
724*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
725*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
726*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
727*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
728*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
729*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
730*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
731*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
732*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
733*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
734*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
735*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
736*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
737*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
738*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
739*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
740*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
741*c217d954SCole Faust
/* Size == 3 */
742*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
743*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
744*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
745*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
746*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
747*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
748*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
749*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
750*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
751*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
752*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
753*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
754*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
755*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
756*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
757*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
758*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
759*c217d954SCole Faust
/* Size == 4 */
760*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
761*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
762*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
763*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
764*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
765*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
766*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
767*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
768*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
769*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
770*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
771*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
772*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
773*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
774*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
775*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
776*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
777*c217d954SCole Faust
/* Size == 8 */
778*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
779*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
780*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
781*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
782*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
783*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
784*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
785*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
786*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
787*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
788*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
789*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
790*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
791*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
792*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
793*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
794*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
795*c217d954SCole Faust
/* Size == 16 */
796*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
797*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
798*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
799*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
800*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
801*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
802*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
803*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
804*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
805*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
806*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
807*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
808*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
809*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
810*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
811*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
812*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
813*c217d954SCole Faust
814*c217d954SCole Faust
/* vstore_partial_<n>(DATA, OFFSET, PTR): store the n lowest components of the
 * vector DATA to PTR. Sizes with a native vstore (1, 2, 3, 4, 8, 16) are a
 * single store. The others are split into an 8- or 4-wide store plus a smaller
 * partial store of the remaining components, with PTR advanced by the number
 * of elements already written.
 * Each expansion ends in a semicolon, so these are statements, not expressions.
 *
 * NOTE(review): the composite forms pass OFFSET unchanged to both halves while
 * advancing PTR by elements. vstore<n> scales its offset by n, whereas vstore1
 * adds it unscaled, so the two halves look contiguous only for OFFSET == 0.
 * Confirm callers always pass 0. */
815*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
816*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
817*c217d954SCole Faust
818*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
819*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
820*c217d954SCole Faust
821*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
822*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
823*c217d954SCole Faust
824*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
825*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
826*c217d954SCole Faust
/* 5..7 elements: 4-wide store followed by 1, 2 or 3 more starting at PTR + 4. */
827*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
828*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
829*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
830*c217d954SCole Faust
831*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
832*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
833*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
834*c217d954SCole Faust
835*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
836*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
837*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
838*c217d954SCole Faust
839*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
840*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
841*c217d954SCole Faust
/* 9..15 elements: 8-wide store followed by a partial store starting at PTR + 8.
 * The upper-half swizzles use lowercase hex digits (s89a, s89ab, s89abcdef),
 * which select the same components as their uppercase spellings. */
842*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
843*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
844*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
845*c217d954SCole Faust
846*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
847*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
848*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
849*c217d954SCole Faust
850*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
851*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
852*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
853*c217d954SCole Faust
854*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
855*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
856*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
857*c217d954SCole Faust
/* 13..15 pass the whole upper half (s89abcdef) to the 5/6/7-element helpers,
 * which then pick its low components through a nested swizzle. */
858*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
859*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
860*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
861*c217d954SCole Faust
862*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
863*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
864*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
865*c217d954SCole Faust
866*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
867*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
868*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
869*c217d954SCole Faust
870*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
871*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
872*c217d954SCole Faust
873*c217d954SCole Faust
874*c217d954SCole Faust
875*c217d954SCole Faust
876*c217d954SCole Faust
/* Saturating-conversion names for floating-point destinations.
 *
 * OpenCL C provides the _sat modifier only for integer destination types.
 * These aliases map every convert_float<n>_sat / convert_half<n>_sat name to the
 * matching plain conversion, so CONVERT_SAT() can be used uniformly with
 * float and half types. The <n> == 1 forms map to the scalar conversion.
 *
 * convert_half_sat used to map to convert_float. That produced a float result
 * for a half destination and disagreed with convert_half1_sat; it now maps to
 * convert_half like the rest of the half family. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
891*c217d954SCole Faust
/* Size-1 conversion names mapped to the scalar conversions, so that
 * convert_<type><size> names formed by pasting a size of 1 resolve to an
 * existing function. */
892*c217d954SCole Faust#define convert_float1 convert_float
893*c217d954SCole Faust#define convert_half1 convert_half
894*c217d954SCole Faust#define convert_char1 convert_char
895*c217d954SCole Faust#define convert_uchar1 convert_uchar
896*c217d954SCole Faust#define convert_short1 convert_short
897*c217d954SCole Faust#define convert_ushort1 convert_ushort
898*c217d954SCole Faust#define convert_int1 convert_int
899*c217d954SCole Faust#define convert_uint1 convert_uint
900*c217d954SCole Faust#define convert_long1 convert_long
901*c217d954SCole Faust#define convert_ulong1 convert_ulong
902*c217d954SCole Faust#define convert_double1 convert_double
903*c217d954SCole Faust
/* Size-1 saturating-conversion names for the integer types, mapped to the scalar
 * convert_<type>_sat functions.
 * The convert_uchar<n>_sat lines for n = 2, 3, 4, 8, 16 map each name to itself;
 * they do not change what the name expands to and are kept only so the names
 * stay defined as macros. */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
/* double is a floating-point destination, for which OpenCL C has no _sat
 * conversion. convert_double1_sat used to map to the non-existent
 * convert_double_sat; the whole family now maps to the plain conversions, in
 * the same way as the float and half families. */
#define convert_double_sat convert_double
#define convert_double1_sat convert_double
#define convert_double2_sat convert_double2
#define convert_double3_sat convert_double3
#define convert_double4_sat convert_double4
#define convert_double8_sat convert_double8
#define convert_double16_sat convert_double16
918*c217d954SCole Faust
/* VEC_DATA_TYPE(type, size): the type name <type><size>, e.g. float4. Both
 * arguments are macro-expanded before being pasted. */
919*c217d954SCole Faust#define VEC_DATA_TYPE_STR(type, size) type##size
920*c217d954SCole Faust#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
921*c217d954SCole Faust
/* CONVERT(x, type): call convert_<type> on x. `type` is macro-expanded first, so
 * it may be a VEC_DATA_TYPE(...) expression. */
922*c217d954SCole Faust#define CONVERT_STR(x, type) (convert_##type((x)))
923*c217d954SCole Faust#define CONVERT(x, type) CONVERT_STR(x, type)
924*c217d954SCole Faust
/* CONVERT_SAT(x, type): call convert_<type>_sat on x. */
925*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
926*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
927*c217d954SCole Faust
/* CONVERT_SAT_ROUND(x, type, round): call convert_<type>_sat_<round> on x, where
 * `round` supplies the rounding-mode suffix of the function name. */
928*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
929*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
930*c217d954SCole Faust
/* Per-element-type lookup used by SELECT_VEC_DATA_TYPE. Integer types map to
 * themselves; half maps to short and float maps to int.
 * NOTE(review): presumably the mask type expected by select() for each
 * element type - confirm against the callers. */
#define select_vec_dt_uchar(n)  uchar##n
#define select_vec_dt_char(n)   char##n
#define select_vec_dt_ushort(n) ushort##n
#define select_vec_dt_short(n)  short##n
#define select_vec_dt_half(n)   short##n
#define select_vec_dt_uint(n)   uint##n
#define select_vec_dt_int(n)    int##n
#define select_vec_dt_float(n)  int##n
#define select_vec_dt_ulong(n)  ulong##n
#define select_vec_dt_long(n)   long##n

/* SELECT_VEC_DATA_TYPE(type, size) dispatches on the element type name;
 * SELECT_DATA_TYPE(type) is the size-1 case (yields <mapped type>1). */
#define SELECT_VEC_DATA_TYPE_STR(elem, n) select_vec_dt_##elem(n)
#define SELECT_VEC_DATA_TYPE(elem, n) SELECT_VEC_DATA_TYPE_STR(elem, n)
#define SELECT_DATA_TYPE(elem) SELECT_VEC_DATA_TYPE_STR(elem, 1)
945*c217d954SCole Faust
/* Per-element-type lookup used by SIGNED_INT_VEC_DATA_TYPE: every entry
 * yields a signed integer vector name (uchar/char -> char, ushort/short/half
 * -> short, uint/int/float -> int, ulong/long -> long). */
#define signed_int_vec_dt_uchar(n)  char##n
#define signed_int_vec_dt_char(n)   char##n
#define signed_int_vec_dt_ushort(n) short##n
#define signed_int_vec_dt_short(n)  short##n
#define signed_int_vec_dt_half(n)   short##n
#define signed_int_vec_dt_uint(n)   int##n
#define signed_int_vec_dt_int(n)    int##n
#define signed_int_vec_dt_float(n)  int##n
#define signed_int_vec_dt_ulong(n)  long##n
#define signed_int_vec_dt_long(n)   long##n

/* SIGNED_INT_VEC_DATA_TYPE(type, size) dispatches on the element type name;
 * SIGNED_INT_DATA_TYPE(type) is the size-1 case. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(elem, n) signed_int_vec_dt_##elem(n)
#define SIGNED_INT_VEC_DATA_TYPE(elem, n) SIGNED_INT_VEC_DATA_TYPE_STR(elem, n)
#define SIGNED_INT_DATA_TYPE(elem) SIGNED_INT_VEC_DATA_TYPE_STR(elem, 1)
960*c217d954SCole Faust
/* SUM_REDUCE(x, size): sum of the `size` components of vector x
 * (size in {1, 2, 3, 4, 8, 16}).
 *
 * Every expansion is wrapped in parentheses so the result can be used as an
 * operand of a larger expression (e.g. `2 * SUM_REDUCE(v, 2)`).
 * The components are still added strictly left to right (s0 + s1 + s2 ...),
 * which is the order the unparenthesized expansions produced, so
 * floating-point results are not re-associated.
 * x is evaluated once per component; do not pass expressions with side
 * effects. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (sum_reduce_2((x).s01) + ((x).s2))
#define sum_reduce_4(x) (sum_reduce_3((x).s012) + ((x).s3))
#define sum_reduce_8(x) (sum_reduce_4((x).s0123) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7))
#define sum_reduce_16(x) (sum_reduce_8((x).s01234567) + ((x).s8) + ((x).s9) + ((x).sA) + ((x).sB) + ((x).sC) + ((x).sD) + ((x).sE) + ((x).sF))

/* Two-level dispatch so that `size` may itself be a macro. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
970*c217d954SCole Faust
/* PROD_REDUCE(x, size): product of the `size` components of vector x
 * (size in {1, 2, 3, 4, 8, 16}).
 *
 * Every expansion is wrapped in parentheses so the result can be used as an
 * operand of a larger expression (e.g. `a / PROD_REDUCE(v, 2)`).
 * The components are still multiplied strictly left to right
 * (s0 * s1 * s2 ...), which is the order the unparenthesized expansions
 * produced, so floating-point results are not re-associated.
 * x is evaluated once per component; do not pass expressions with side
 * effects. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (prod_reduce_2((x).s01) * ((x).s2))
#define prod_reduce_4(x) (prod_reduce_3((x).s012) * ((x).s3))
#define prod_reduce_8(x) (prod_reduce_4((x).s0123) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7))
#define prod_reduce_16(x) (prod_reduce_8((x).s01234567) * ((x).s8) * ((x).s9) * ((x).sA) * ((x).sB) * ((x).sC) * ((x).sD) * ((x).sE) * ((x).sF))

/* Two-level dispatch so that `size` may itself be a macro. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
980*c217d954SCole Faust
/* MAX_REDUCE(x, size): largest component of vector x, computed with nested
 * max() calls over halves of the vector (size in {1, 2, 3, 4, 8, 16}).
 * The argument is evaluated once per component. */
#define max_reduce_1(v) (v)
#define max_reduce_2(v) max(((v).s0), ((v).s1))
#define max_reduce_3(v) max(max_reduce_2((v).s01), ((v).s2))
#define max_reduce_4(v) max(max_reduce_2((v).s01), max_reduce_2((v).s23))
#define max_reduce_8(v) max(max_reduce_4((v).s0123), max_reduce_4((v).s4567))
#define max_reduce_16(v) max(max_reduce_8((v).s01234567), max_reduce_8((v).s89ABCDEF))

/* Two-level dispatch so that `size` may itself be a macro. */
#define MAX_REDUCE_STR(v, size) max_reduce_##size(v)
#define MAX_REDUCE(v, size) MAX_REDUCE_STR(v, size)
990*c217d954SCole Faust
/* Expands to the parameter list for a 1D buffer called `name`:
 * name_ptr, name_stride_x, name_step_x, name_offset_first_element_in_bytes.
 * No trailing comma is emitted. */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes
996*c217d954SCole Faust
/* Expands to the parameter list for a 2D buffer called `name`:
 * name_ptr, stride/step pairs for x and y, and
 * name_offset_first_element_in_bytes. No trailing comma is emitted. */
#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes
1004*c217d954SCole Faust
/* Expands to the parameter list for a 3D buffer called `name`:
 * name_ptr, stride/step pairs for x, y and z, and
 * name_offset_first_element_in_bytes. No trailing comma is emitted. */
#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes
1014*c217d954SCole Faust
/* Expands to the parameter list for a 4D buffer called `name`:
 * name_ptr, stride/step pairs for x, y, z and w, and
 * name_offset_first_element_in_bytes. No trailing comma is emitted. */
#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes
1026*c217d954SCole Faust
/* Expands to the parameter list for a 5D buffer called `name`:
 * name_ptr, stride/step pairs for x, y, z, w and v, and
 * name_offset_first_element_in_bytes. No trailing comma is emitted. */
#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes
1040*c217d954SCole Faust
/* Builds a Vector from the name_* values produced by VECTOR_DECLARATION(name)
 * by calling update_vector_workitem_ptr(). */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, \
                               name##_offset_first_element_in_bytes, \
                               name##_stride_x, \
                               name##_step_x)

/* Same as CONVERT_TO_VECTOR_STRUCT but passes 0 as the x step. */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, \
                               name##_offset_first_element_in_bytes, \
                               name##_stride_x, \
                               0)
1046*c217d954SCole Faust
/* Builds an Image from the name_* values produced by IMAGE_DECLARATION(name)
 * by calling update_image_workitem_ptr(). */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, \
                              name##_offset_first_element_in_bytes, \
                              name##_stride_x, name##_step_x, \
                              name##_stride_y, name##_step_y)

/* Same as CONVERT_TO_IMAGE_STRUCT but passes 0 for both steps. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, \
                              name##_offset_first_element_in_bytes, \
                              name##_stride_x, 0, \
                              name##_stride_y, 0)
1052*c217d954SCole Faust
/* Builds an Image from the name_* values produced by
 * TENSOR3D_DECLARATION(name) by calling
 * update_image_from_tensor3D_workitem_ptr(). */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Same as CONVERT_TENSOR3D_TO_IMAGE_STRUCT but passes 0 for the x and y
 * steps; the z step is still forwarded. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1061*c217d954SCole Faust
/* Builds a Tensor3D from the name_* values produced by
 * TENSOR3D_DECLARATION(name) by calling update_tensor3D_workitem_ptr(). */
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, \
                                 name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x, \
                                 name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

/* Same as CONVERT_TO_TENSOR3D_STRUCT but passes 0 for all three steps. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, \
                                 name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0, \
                                 name##_stride_y, 0, \
                                 name##_stride_z, 0)
1068*c217d954SCole Faust
/* Builds a Tensor4D from the name_* values produced by
 * TENSOR4D_DECLARATION(name) by calling update_tensor4D_workitem_ptr();
 * mod_size is forwarded unchanged as the last argument. */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, \
                                 name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x, \
                                 name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, \
                                 name##_stride_w, name##_step_w, \
                                 mod_size)

/* Same as CONVERT_TO_TENSOR4D_STRUCT but passes 0 for all four steps. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, \
                                 name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0, \
                                 name##_stride_y, 0, \
                                 name##_stride_z, 0, \
                                 name##_stride_w, 0, \
                                 mod_size)
1075*c217d954SCole Faust
/* Builds a Tensor3D from the name_* values produced by
 * TENSOR3D_DECLARATION(name) by calling tensor3D_ptr_no_update(). */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, \
                           name##_offset_first_element_in_bytes, \
                           name##_stride_x, name##_step_x, \
                           name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
1079*c217d954SCole Faust
1080*c217d954SCole Faust
1081*c217d954SCole Fausttypedef struct Vector
1082*c217d954SCole Faust{
1083*c217d954SCole Faust    __global uchar *ptr;
1084*c217d954SCole Faust    int             offset_first_element_in_bytes;
1085*c217d954SCole Faust    int             stride_x;
1086*c217d954SCole Faust} Vector;
1087*c217d954SCole Faust
1088*c217d954SCole Faust
1089*c217d954SCole Fausttypedef struct Image
1090*c217d954SCole Faust{
1091*c217d954SCole Faust    __global uchar *ptr;
1092*c217d954SCole Faust    int             offset_first_element_in_bytes;
1093*c217d954SCole Faust    int             stride_x;
1094*c217d954SCole Faust    int             stride_y;
1095*c217d954SCole Faust} Image;
1096*c217d954SCole Faust
1097*c217d954SCole Faust
1098*c217d954SCole Fausttypedef struct Tensor3D
1099*c217d954SCole Faust{
1100*c217d954SCole Faust    __global uchar *ptr;
1101*c217d954SCole Faust    int             offset_first_element_in_bytes;
1102*c217d954SCole Faust    int             stride_x;
1103*c217d954SCole Faust    int             stride_y;
1104*c217d954SCole Faust    int             stride_z;
1105*c217d954SCole Faust} Tensor3D;
1106*c217d954SCole Faust
1107*c217d954SCole Faust
1108*c217d954SCole Fausttypedef struct Tensor4D
1109*c217d954SCole Faust{
1110*c217d954SCole Faust    __global uchar *ptr;
1111*c217d954SCole Faust    int             offset_first_element_in_bytes;
1112*c217d954SCole Faust    int             stride_x;
1113*c217d954SCole Faust    int             stride_y;
1114*c217d954SCole Faust    int             stride_z;
1115*c217d954SCole Faust    int             stride_w;
1116*c217d954SCole Faust} Tensor4D;
1117*c217d954SCole Faust
1118*c217d954SCole Faust
1119*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1120*c217d954SCole Faust{
1121*c217d954SCole Faust    Vector vector =
1122*c217d954SCole Faust    {
1123*c217d954SCole Faust        .ptr                           = ptr,
1124*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1125*c217d954SCole Faust        .stride_x                      = stride_x,
1126*c217d954SCole Faust    };
1127*c217d954SCole Faust    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1128*c217d954SCole Faust    return vector;
1129*c217d954SCole Faust}
1130*c217d954SCole Faust
1131*c217d954SCole Faust
1132*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1133*c217d954SCole Faust{
1134*c217d954SCole Faust    Image img =
1135*c217d954SCole Faust    {
1136*c217d954SCole Faust        .ptr                           = ptr,
1137*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1138*c217d954SCole Faust        .stride_x                      = stride_x,
1139*c217d954SCole Faust        .stride_y                      = stride_y
1140*c217d954SCole Faust    };
1141*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1142*c217d954SCole Faust    return img;
1143*c217d954SCole Faust}
1144*c217d954SCole Faust
1145*c217d954SCole Faust
1146*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1147*c217d954SCole Faust{
1148*c217d954SCole Faust    Image img =
1149*c217d954SCole Faust    {
1150*c217d954SCole Faust        .ptr                           = ptr,
1151*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1152*c217d954SCole Faust        .stride_x                      = stride_x,
1153*c217d954SCole Faust        .stride_y                      = stride_y
1154*c217d954SCole Faust    };
1155*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1156*c217d954SCole Faust    return img;
1157*c217d954SCole Faust}
1158*c217d954SCole Faust
1159*c217d954SCole Faust
1160*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1161*c217d954SCole Faust{
1162*c217d954SCole Faust    Tensor3D tensor =
1163*c217d954SCole Faust    {
1164*c217d954SCole Faust        .ptr                           = ptr,
1165*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1166*c217d954SCole Faust        .stride_x                      = stride_x,
1167*c217d954SCole Faust        .stride_y                      = stride_y,
1168*c217d954SCole Faust        .stride_z                      = stride_z
1169*c217d954SCole Faust    };
1170*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1171*c217d954SCole Faust    return tensor;
1172*c217d954SCole Faust}
1173*c217d954SCole Faust
1174*c217d954SCole Faust
1175*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1176*c217d954SCole Faust{
1177*c217d954SCole Faust    Tensor3D tensor =
1178*c217d954SCole Faust    {
1179*c217d954SCole Faust        .ptr                           = ptr,
1180*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1181*c217d954SCole Faust        .stride_x                      = stride_x,
1182*c217d954SCole Faust        .stride_y                      = stride_y,
1183*c217d954SCole Faust        .stride_z                      = stride_z
1184*c217d954SCole Faust    };
1185*c217d954SCole Faust    return tensor;
1186*c217d954SCole Faust}
1187*c217d954SCole Faust
1188*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1189*c217d954SCole Faust                                             uint step_w,
1190*c217d954SCole Faust                                             uint mod_size)
1191*c217d954SCole Faust{
1192*c217d954SCole Faust    Tensor4D tensor =
1193*c217d954SCole Faust    {
1194*c217d954SCole Faust        .ptr                           = ptr,
1195*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1196*c217d954SCole Faust        .stride_x                      = stride_x,
1197*c217d954SCole Faust        .stride_y                      = stride_y,
1198*c217d954SCole Faust        .stride_z                      = stride_z,
1199*c217d954SCole Faust        .stride_w                      = stride_w
1200*c217d954SCole Faust    };
1201*c217d954SCole Faust
1202*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1203*c217d954SCole Faust    return tensor;
1204*c217d954SCole Faust}
1205*c217d954SCole Faust
1206*c217d954SCole Faust
1207*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
1208*c217d954SCole Faust{
1209*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
1210*c217d954SCole Faust}
1211*c217d954SCole Faust
1212*c217d954SCole Faust
1213*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
1214*c217d954SCole Faust{
1215*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
1216*c217d954SCole Faust}
1217*c217d954SCole Faust
1218*c217d954SCole Faust
1219*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1220*c217d954SCole Faust{
1221*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1222*c217d954SCole Faust}
1223*c217d954SCole Faust
1224*c217d954SCole Faust
1225*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1226*c217d954SCole Faust{
1227*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1228*c217d954SCole Faust}
1229*c217d954SCole Faust
1230*c217d954SCole Faust
1231*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1232*c217d954SCole Faust{
1233*c217d954SCole Faust    uint num_elements = width * height;
1234*c217d954SCole Faust
1235*c217d954SCole Faust    const uint z = index / num_elements;
1236*c217d954SCole Faust
1237*c217d954SCole Faust    index %= num_elements;
1238*c217d954SCole Faust
1239*c217d954SCole Faust    const uint y = index / width;
1240*c217d954SCole Faust
1241*c217d954SCole Faust    index %= width;
1242*c217d954SCole Faust
1243*c217d954SCole Faust    const uint x = index;
1244*c217d954SCole Faust
1245*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1246*c217d954SCole Faust}
1247*c217d954SCole Faust
1248*c217d954SCole Faust#endif
1249*c217d954SCole Faust
/* MLA(a, b, c): multiply-accumulate, a + b * c.
 * When GPU_ARCH equals GPU_ARCH_BIFROST the fma() built-in is used, otherwise
 * a plain multiply followed by an add. */
#if GPU_ARCH == GPU_ARCH_BIFROST
#define MLA(acc, lhs, rhs) (fma(rhs, lhs, acc))
#else /* GPU_ARCH != GPU_ARCH_BIFROST */
#define MLA(acc, lhs, rhs) ((lhs) * (rhs) + (acc))
#endif /* GPU_ARCH == GPU_ARCH_BIFROST */
1255*c217d954SCole Faust
1256*c217d954SCole Faust
/* Activation functions.
 *
 * Every <op>_op macro takes (DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL); an op
 * ignores the parameters it does not need. x, A_VAL and B_VAL are
 * parenthesized wherever they are used as operands, so compound expressions
 * (e.g. `a + b`) can be passed safely. x may be evaluated more than once; do
 * not pass expressions with side effects.
 */

/* hard_swish: x * min(max(x + 3, 0), 6) * 0.166666667 */
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * ((min(max(((x) + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

/* logistic: 1 / (1 + exp(-x)) */
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-(x))))

/* tanh: A_VAL * tanh(B_VAL * x) */
#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)(A_VAL) * tanh((DATA_TYPE)(B_VAL) * (x)))

/* relu: max(0, x) */
#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, (x)))

/* brelu: min(A_VAL, max(0, x)) */
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)0.0, (x))))

/* lu_brelu: min(max(x, B_VAL), A_VAL) */
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

/* lrelu: min(x, 0) * A_VAL + max(x, 0) */
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min((x), (DATA_TYPE)0.0) * (DATA_TYPE)(A_VAL)) + max((x), (DATA_TYPE)0.0))

/* srelu: log(1 + exp(x)) */
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))

/* elu: x where x >= 0, otherwise A_VAL * (exp(x) - 1) */
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)(A_VAL) * (exp(x) - (DATA_TYPE)1.0)), (x), (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal((x), (DATA_TYPE)0.0)))

/* abs: fabs(x) */
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))

/* square: x * x */
#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (x))

/* sqrt: sqrt(x) */
#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))

/* linear: MLA(B_VAL, A_VAL, x), i.e. B_VAL + A_VAL * x */
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)(B_VAL), (DATA_TYPE)(A_VAL), (x)))

/* gelu: x * 0.5 * (1 + erf(x / 1.41421356237)) */
#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf((x) / (DATA_TYPE)1.41421356237)))

/* identity: x */
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

/* ACTIVATION(op, ...) selects <op>_op by token pasting. The two-level
 * dispatch lets `op` (and the other arguments) be macros themselves. */
#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1304*c217d954SCole Faust
1305*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
1306*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
1307*c217d954SCole Faust
1308*c217d954SCole Faust
1309*c217d954SCole Faust
1310*c217d954SCole Faust
/* STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16.
 *
 * Emits n store statements. Row k stores BASENAME##k through VSTORE(N0) at
 * element offset 0 of (__global DATA_TYPE *)(PTR + k * STRIDE_Y + Z##k).
 * The pasted suffix k is a single hex digit (0-9, A-F). Each STORE_ROW_n
 * expands STORE_ROW_(n-1) and appends one more row.
 *
 * The expansion already ends in ';' and is a sequence of statements, not a
 * single expression.
 *
 * NOTE(review): the same macro names are also defined near the top of this
 * file; the token sequence here is deliberately unchanged so both copies
 * stay identical. */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389*c217d954SCole Faust
1390*c217d954SCole Faust
1391*c217d954SCole Faust
/* CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..8.
 *
 * Emits n store statements. Row k first converts BASENAME##k with
 * CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) and then stores it through
 * VSTORE(N0) at element offset 0 of
 * (__global DATA_TYPE *)(PTR + k * STRIDE_Y + Z##k).
 * Each CONVERT_STORE_ROW_n expands CONVERT_STORE_ROW_(n-1) and appends one
 * more row.
 *
 * The expansion already ends in ';' and is a sequence of statements, not a
 * single expression. */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1430*c217d954SCole Faust
1431*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1432*c217d954SCole Faust    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1433*c217d954SCole Faust    VSTORE(N0)                                                         \
1434*c217d954SCole Faust    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435*c217d954SCole Faust
/**
 * Convert and store rows 0..9 of a block (10 rows in total).
 *
 * Expands CONVERT_STORE_ROW_9 for rows 0..8 and then appends the store for
 * row 9: BASENAME##9 is passed through CONVERT_SAT with target type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and handed to VSTORE(N0) with element offset 0
 * and destination (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9).
 *
 * The second parameter has to be named DATA_TYPE, like in every sibling macro:
 * the body and the nested CONVERT_STORE_ROW_9 call refer to DATA_TYPE, and with
 * any other parameter name they would pick up a file-scope DATA_TYPE macro (or
 * an undefined identifier) instead of the type supplied by the caller.
 *
 * @param N0        Vector width forwarded to VSTORE / VEC_DATA_TYPE.
 * @param DATA_TYPE Element type of the destination and of the converted vector.
 * @param BASENAME  Prefix of the row variables (BASENAME##0 .. BASENAME##9).
 * @param PTR       Base address used for the destination computation.
 * @param STRIDE_Y  Distance between consecutive rows, added to PTR per row.
 * @param Z         Prefix of the per-row extra offsets (Z##0 .. Z##9).
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440*c217d954SCole Faust
/**
 * CONVERT_STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 11..16
 *
 * Same chaining scheme as the lower-numbered CONVERT_STORE_ROW_<n> macros:
 * expand the macro for n-1 rows, then append the store for row r = n-1.
 * Rows 10..15 are addressed with the hexadecimal-style suffixes A..F, i.e.
 * the row variable is BASENAME##A .. BASENAME##F and the per-row offset is
 * Z##A .. Z##F, while the stride multiplier stays decimal (10..15).
 * The stored value is CONVERT_SAT(BASENAME##r, VEC_DATA_TYPE(DATA_TYPE, N0)),
 * written through VSTORE(N0) at element offset 0.
 */
1441*c217d954SCole Faust#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1442*c217d954SCole Faust    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1443*c217d954SCole Faust    VSTORE(N0)                                                          \
1444*c217d954SCole Faust    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1445*c217d954SCole Faust
1446*c217d954SCole Faust#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1447*c217d954SCole Faust    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1448*c217d954SCole Faust    VSTORE(N0)                                                          \
1449*c217d954SCole Faust    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1450*c217d954SCole Faust
1451*c217d954SCole Faust#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1452*c217d954SCole Faust    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1453*c217d954SCole Faust    VSTORE(N0)                                                          \
1454*c217d954SCole Faust    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1455*c217d954SCole Faust
1456*c217d954SCole Faust#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1457*c217d954SCole Faust    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1458*c217d954SCole Faust    VSTORE(N0)                                                          \
1459*c217d954SCole Faust    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1460*c217d954SCole Faust
1461*c217d954SCole Faust#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1462*c217d954SCole Faust    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1463*c217d954SCole Faust    VSTORE(N0)                                                          \
1464*c217d954SCole Faust    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1465*c217d954SCole Faust
1466*c217d954SCole Faust#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1467*c217d954SCole Faust    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1468*c217d954SCole Faust    VSTORE(N0)                                                          \
1469*c217d954SCole Faust    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1470*c217d954SCole Faust
1471*c217d954SCole Faust
1472*c217d954SCole Faust
1473*c217d954SCole Faust
/**
 * STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to STORE_ROW_<M0>, i.e. stores M0 rows of width N0.
 * STORE_BLOCK forwards to STORE_BLOCK_STR, which pastes M0 onto "STORE_ROW_".
 * The extra level lets an M0 that is itself a macro be expanded to its value
 * before the token paste takes place.
 */
1474*c217d954SCole Faust#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1475*c217d954SCole Faust#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1476*c217d954SCole Faust
1477*c217d954SCole Faust
1478*c217d954SCole Faust
/**
 * CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to CONVERT_STORE_ROW_<M0>, i.e. converts and stores M0 rows.
 * CONVERT_STORE_BLOCK forwards to CONVERT_STORE_BLOCK_STR, which pastes M0
 * onto "CONVERT_STORE_ROW_"; the indirection lets a macro-valued M0 be
 * expanded before it is pasted.
 */
1479*c217d954SCole Faust#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1480*c217d954SCole Faust#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1481*c217d954SCole Faust
1482*c217d954SCole Faust
1483*c217d954SCole Faust
/**
 * STORE_ROW_PARTIAL_<n>(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16
 *
 * Emits n consecutive row stores through VSTORE_PARTIAL(N0, STORE_N0).
 * Each macro first expands the macro for n-1 rows and then appends the store
 * for row r = n-1: the variable BASENAME##r is passed, with element offset 0,
 * to the destination (__global DATA_TYPE *)(PTR + r * STRIDE_Y + Z##r).
 * Rows 10..15 use the suffixes A..F on BASENAME and Z.
 *
 * Every appended statement ends with ';'. VSTORE_PARTIAL is not defined in
 * this part of the file.
 * NOTE(review): presumably N0 is the width of the row vector and STORE_N0 the
 * number of leading elements actually written - confirm against the
 * VSTORE_PARTIAL definition.
 */
1484*c217d954SCole Faust#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1485*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1486*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1487*c217d954SCole Faust
1488*c217d954SCole Faust#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1489*c217d954SCole Faust    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1490*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1491*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1492*c217d954SCole Faust
1493*c217d954SCole Faust#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1494*c217d954SCole Faust    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1495*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1496*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1497*c217d954SCole Faust
1498*c217d954SCole Faust#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1499*c217d954SCole Faust    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1500*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1501*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1502*c217d954SCole Faust
1503*c217d954SCole Faust#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1504*c217d954SCole Faust    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1505*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1506*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1507*c217d954SCole Faust
1508*c217d954SCole Faust#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1509*c217d954SCole Faust    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1510*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1511*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1512*c217d954SCole Faust
1513*c217d954SCole Faust#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1514*c217d954SCole Faust    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1515*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1516*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1517*c217d954SCole Faust
1518*c217d954SCole Faust#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1519*c217d954SCole Faust    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1520*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1521*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1522*c217d954SCole Faust
1523*c217d954SCole Faust#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1524*c217d954SCole Faust    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1525*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1526*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1527*c217d954SCole Faust
1528*c217d954SCole Faust#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1529*c217d954SCole Faust    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1530*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1531*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1532*c217d954SCole Faust
1533*c217d954SCole Faust#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1534*c217d954SCole Faust    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1535*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1536*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1537*c217d954SCole Faust
1538*c217d954SCole Faust#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1539*c217d954SCole Faust    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1540*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1541*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1542*c217d954SCole Faust
1543*c217d954SCole Faust#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1544*c217d954SCole Faust    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1545*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1546*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1547*c217d954SCole Faust
1548*c217d954SCole Faust#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1549*c217d954SCole Faust    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1550*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1551*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1552*c217d954SCole Faust
1553*c217d954SCole Faust#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1554*c217d954SCole Faust    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1555*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1556*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1557*c217d954SCole Faust
1558*c217d954SCole Faust#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1559*c217d954SCole Faust    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1560*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1561*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1562*c217d954SCole Faust
1563*c217d954SCole Faust
1564*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to STORE_ROW_PARTIAL_<STORE_M0>(N0, STORE_N0, ...): STORE_M0
 * selects how many rows are emitted, while N0 and STORE_N0 are forwarded to
 * VSTORE_PARTIAL by the row macros. Note the argument order swap: this macro
 * takes (STORE_M0, STORE_N0, N0, ...) but the row macros take (N0, STORE_N0, ...).
 * The _STR indirection lets a macro-valued STORE_M0 be expanded before it is
 * pasted onto "STORE_ROW_PARTIAL_".
 */
1565*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1566*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1567*c217d954SCole Faust
/**
 * Store a block that may be partial in both dimensions, chosen at run time.
 *
 * Expands to a four-way if / else-if chain on the two conditions:
 *   - neither condition true : STORE_BLOCK_PARTIAL(M0, N0, N0, ...)
 *   - only PARTIAL_COND_Y    : STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, ...)
 *   - only PARTIAL_COND_X    : STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, ...)
 *   - both conditions true   : STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, ...)
 *
 * The conditions are evaluated more than once in the expansion, so they should
 * be free of side effects. The expansion is a bare if/else chain (it is not
 * wrapped in do { } while(0)), which matters when it is used as the body of an
 * unbraced if/else at the call site.
 */
1568*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1569*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
1570*c217d954SCole Faust    {                                                                                                                                                     \
1571*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
1572*c217d954SCole Faust    }                                                                                                                                                     \
1573*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
1574*c217d954SCole Faust    {                                                                                                                                                     \
1575*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1576*c217d954SCole Faust    }                                                                                                                                                     \
1577*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
1578*c217d954SCole Faust    {                                                                                                                                                     \
1579*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1580*c217d954SCole Faust    }                                                                                                                                                     \
1581*c217d954SCole Faust    else                                                                                                                                                  \
1582*c217d954SCole Faust    {                                                                                                                                                     \
1583*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
1584*c217d954SCole Faust    }
1585*c217d954SCole Faust
/**
 * Store a block that may be partial in the x dimension only.
 *
 * Expands to an if/else: when PARTIAL_COND_X is false the block is stored with
 * STORE_BLOCK_PARTIAL(M0, N0, N0, ...); otherwise with
 * STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, ...). All M0 rows are emitted
 * in both branches. The expansion is a bare if/else, not a do { } while(0).
 */
1586*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
1587*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                         \
1588*c217d954SCole Faust    {                                                                                                             \
1589*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1590*c217d954SCole Faust    }                                                                                                             \
1591*c217d954SCole Faust    else                                                                                                          \
1592*c217d954SCole Faust    {                                                                                                             \
1593*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1594*c217d954SCole Faust    }
1595*c217d954SCole Faust
/**
 * Store a block that may be partial in the y dimension only.
 *
 * Expands to an if/else: when PARTIAL_COND_Y is false the block is stored with
 * STORE_BLOCK_PARTIAL(M0, N0, N0, ...); otherwise only PARTIAL_STORE_M0 rows
 * are emitted via STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, ...). The
 * STORE_N0 argument equals N0 in both branches. The expansion is a bare
 * if/else, not a do { } while(0).
 */
1596*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
1597*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                         \
1598*c217d954SCole Faust    {                                                                                                             \
1599*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1600*c217d954SCole Faust    }                                                                                                             \
1601*c217d954SCole Faust    else                                                                                                          \
1602*c217d954SCole Faust    {                                                                                                             \
1603*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1604*c217d954SCole Faust    }
1605*c217d954SCole Faust
1606*c217d954SCole Faust
/**
 * STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                            PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
 *
 * Defined only when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined at
 * preprocessing time. Their compile-time values pick one of four definitions,
 * so only the run-time checks that can actually be needed are generated:
 *   - M0 == 0 && N0 == 0 : plain STORE_BLOCK, no run-time condition at all
 *   - M0 >  0 && N0 == 0 : STORE_BLOCK_PARTIAL_IN_Y
 *   - M0 == 0 && N0 >  0 : STORE_BLOCK_PARTIAL_IN_X
 *   - otherwise          : STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * (M0 / N0 above stand for the file-scope PARTIAL_STORE_M0 / PARTIAL_STORE_N0.)
 *
 * The macro parameters named PARTIAL_STORE_M0 / PARTIAL_STORE_N0 are ordinary
 * parameters; the #if tests use the file-scope macros of the same name.
 */
1607*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
1608*c217d954SCole Faust
1609*c217d954SCole Faust
1610*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
1611*c217d954SCole Faust
1612*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1613*c217d954SCole Faust    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1614*c217d954SCole Faust
1615*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
1616*c217d954SCole Faust
1617*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1618*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
1619*c217d954SCole Faust
1620*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
1621*c217d954SCole Faust
1622*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1623*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
1624*c217d954SCole Faust
1625*c217d954SCole Faust#else
1626*c217d954SCole Faust
1627*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1628*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
1629*c217d954SCole Faust
1630*c217d954SCole Faust#endif
1631*c217d954SCole Faust
1632*c217d954SCole Faust#endif
1633*c217d954SCole Faust
1634*c217d954SCole Faust
/**
 * COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * Computes the index of the first row handled by block index y when every
 * block spans M0 rows. The result is cast to uint.
 *
 * - When the file-scope macro PARTIAL_STORE_M0 is defined, the result is
 *   max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0)): the start row is moved
 *   back by (M0 - PARTIAL_STORE_M0) % M0 rows and clamped at 0.
 *   NOTE(review): presumably this places the partial block first so that all
 *   later blocks are full - confirm against the kernels using it.
 * - Otherwise the third argument is ignored and the result is y * M0.
 *
 * Every macro argument is parenthesized in the expansion so that expression
 * arguments (e.g. "a + b" for y, or "N - K" for M0) keep their intended
 * grouping.
 *
 * @param y                Block index along the row dimension.
 * @param M0               Number of rows per block.
 * @param PARTIAL_STORE_M0 Number of rows in the partial block.
 */
#if defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
1643*c217d954SCole Faust
1644*c217d954SCole Faust
1645*c217d954SCole Faust
/**
 * STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
 *
 * Stores a single row (M0 = 1) through STORE_BLOCK_PARTIAL_IN_X with
 * STRIDE_Y = 0 and Z = 0: the full vec_size elements when cond is false,
 * otherwise a partial store with PARTIAL_STORE_N0 = leftover.
 * Because the row macros paste the row suffix onto their BASENAME and Z
 * arguments, the stored variable is basename##0 and the Z term expands to the
 * literal token 00.
 */
1646*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
1647*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1648*c217d954SCole Faust
1649*c217d954SCole Faust
/*
 * Optional OpenCL extensions. Each extension is enabled only when both the
 * corresponding ARM_COMPUTE_* build flag and the extension's own feature macro
 * (cl_khr_fp16, cl_arm_integer_dot_product_int8,
 * cl_arm_integer_dot_product_accumulate_int8, cl_arm_printf) are defined.
 */
1650*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1651*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1652*c217d954SCole Faust#endif
1653*c217d954SCole Faust
1654*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1655*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
1656*c217d954SCole Faust#endif
1657*c217d954SCole Faust
1658*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
1659*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
1660*c217d954SCole Faust#endif
1661*c217d954SCole Faust
1662*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
1663*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable
1664*c217d954SCole Faust#endif
1665*c217d954SCole Faust
/*
 * Numeric identifiers for GPU architecture families (Midgard, Bifrost,
 * Valhall). Their consumers are not visible in this part of the file.
 */
1666*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100
1667*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200
1668*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300
1669*c217d954SCole Faust
1670*c217d954SCole Faust
/* Pastes the tokens a and b together. Being direct operands of ##, the
 * arguments are NOT macro-expanded before pasting. */
1671*c217d954SCole Faust#define CONCAT(a, b) a##b
1672*c217d954SCole Faust
1673*c217d954SCole Faust
/* Identity macro: yields its (macro-expanded) argument unchanged. */
1674*c217d954SCole Faust#define EXPAND(x) x
1675*c217d954SCole Faust
1676*c217d954SCole Faust
/* Limits x to the range [min_val, max_val] using min() and max():
 * min(max(x, min_val), max_val). */
1677*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1678*c217d954SCole Faust
1679*c217d954SCole Faust
/*
 * REV<s>(x): the components of an s-element vector x in reversed order,
 * expressed as a swizzle (e.g. REV4(x) is x.s3210). REV1 returns x unchanged.
 * Variants exist for s = 1, 2, 3, 4, 8 and 16.
 */
1680*c217d954SCole Faust#define REV1(x) ((x))
1681*c217d954SCole Faust#define REV2(x) ((x).s10)
1682*c217d954SCole Faust#define REV3(x) ((x).s210)
1683*c217d954SCole Faust#define REV4(x) ((x).s3210)
1684*c217d954SCole Faust#define REV8(x) ((x).s76543210)
1685*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210)
1686*c217d954SCole Faust
1687*c217d954SCole Faust
1688*c217d954SCole Faust
/*
 * REVERSE(x, s): reverse the s-element vector x by dispatching to REV<s>.
 * REVERSE forwards to REVERSE_STR so that a macro-valued s is expanded before
 * it is pasted onto "REV".
 */
1689*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x))
1690*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s)
1691*c217d954SCole Faust
1692*c217d954SCole Faust
1693*c217d954SCole Faust
/*
 * ROT<s>_<n>(x): the s-element vector x rotated by n positions, expressed as a
 * swizzle. Element i of the input ends up at index (i + n) mod s of the
 * result, e.g. ROT4_1(x) is x.s3012 (the last element wraps to the front).
 * ROT<s>_0 and ROT<s>_<s> both return x unchanged.
 * Variants exist for s = 1, 2, 3, 4, 8 and 16, with n in 0..s.
 */
1694*c217d954SCole Faust#define ROT1_0(x) ((x))
1695*c217d954SCole Faust#define ROT1_1(x) ((x))
1696*c217d954SCole Faust
1697*c217d954SCole Faust#define ROT2_0(x) ((x))
1698*c217d954SCole Faust#define ROT2_1(x) ((x).s10)
1699*c217d954SCole Faust#define ROT2_2(x) ((x))
1700*c217d954SCole Faust
1701*c217d954SCole Faust#define ROT3_0(x) ((x))
1702*c217d954SCole Faust#define ROT3_1(x) ((x).s201)
1703*c217d954SCole Faust#define ROT3_2(x) ((x).s120)
1704*c217d954SCole Faust#define ROT3_3(x) ((x))
1705*c217d954SCole Faust
1706*c217d954SCole Faust#define ROT4_0(x) ((x))
1707*c217d954SCole Faust#define ROT4_1(x) ((x).s3012)
1708*c217d954SCole Faust#define ROT4_2(x) ((x).s2301)
1709*c217d954SCole Faust#define ROT4_3(x) ((x).s1230)
1710*c217d954SCole Faust#define ROT4_4(x) ((x))
1711*c217d954SCole Faust
1712*c217d954SCole Faust#define ROT8_0(x) ((x))
1713*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456)
1714*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345)
1715*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234)
1716*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123)
1717*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012)
1718*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701)
1719*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670)
1720*c217d954SCole Faust#define ROT8_8(x) ((x))
1721*c217d954SCole Faust
1722*c217d954SCole Faust#define ROT16_0(x) ((x))
1723*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
1724*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
1725*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
1726*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
1727*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
1728*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
1729*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
1730*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
1731*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
1732*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
1733*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
1734*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
1735*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
1736*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
1737*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
1738*c217d954SCole Faust#define ROT16_16(x) ((x))
1739*c217d954SCole Faust
1740*c217d954SCole Faust
1741*c217d954SCole Faust
/*
 * ROTATE(x, s, n): rotate the s-element vector x by n positions by dispatching
 * to ROT<s>_<n>. ROTATE forwards to ROTATE_STR so that macro-valued s and n
 * are expanded before they are pasted into the macro name.
 */
1742*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
1743*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1744*c217d954SCole Faust
1745*c217d954SCole Faust
1746*c217d954SCole Faust
/*
 * V_OFFS<s>(dt): a vector literal of type dt##s holding the consecutive
 * values 0, 1, ..., s-1. Variants exist for s = 1, 2, 3, 4, 8 and 16.
 * NOTE(review): V_OFFS1 names the type dt##1 (e.g. int1), which is not a
 * built-in OpenCL type; presumably it is provided elsewhere - confirm.
 */
1747*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0)
1748*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1)
1749*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2)
1750*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
1751*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
1752*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
1753*c217d954SCole Faust
1754*c217d954SCole Faust
1755*c217d954SCole Faust
/*
 * VEC_OFFS(dt, s): dispatch to V_OFFS<s>(dt). VEC_OFFS forwards to
 * VEC_OFFS_STR so that a macro-valued s is expanded before it is pasted onto
 * "V_OFFS".
 */
1756*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
1757*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1758*c217d954SCole Faust
1759*c217d954SCole Faust
/*
 * VLOAD(size): yields the name vload<size> (e.g. VLOAD(4) -> vload4), to be
 * followed by the call arguments at the use site. The _STR indirection lets a
 * macro-valued size be expanded before it is pasted.
 */
1760*c217d954SCole Faust#define VLOAD_STR(size) vload##size
1761*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size)
1762*c217d954SCole Faust
1763*c217d954SCole Faust
/*
 * VLOAD_PARTIAL(size, load_size): yields the name
 * vload_partial_<size>_<load_size>, to be followed by the call arguments at
 * the use site. Both arguments are expanded before pasting thanks to the _STR
 * indirection.
 */
1764*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
1765*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1766*c217d954SCole Faust
/* Placeholder for load combinations that perform no load: ignores all three
 * arguments and expands to an empty block. */
1767*c217d954SCole Faust#define NO_LOAD(data, offs, ptr) \
1768*c217d954SCole Faust    {                            \
1769*c217d954SCole Faust    }
1770*c217d954SCole Faust
1771*c217d954SCole Faust
1772*c217d954SCole Faust#define vload_partial_1_0 NO_LOAD
1773*c217d954SCole Faust#define vload_partial_1_1 vload1
1774*c217d954SCole Faust#define vload_partial_1_2 NO_LOAD
1775*c217d954SCole Faust#define vload_partial_1_3 NO_LOAD
1776*c217d954SCole Faust#define vload_partial_1_4 NO_LOAD
1777*c217d954SCole Faust#define vload_partial_1_5 NO_LOAD
1778*c217d954SCole Faust#define vload_partial_1_6 NO_LOAD
1779*c217d954SCole Faust#define vload_partial_1_7 NO_LOAD
1780*c217d954SCole Faust#define vload_partial_1_8 NO_LOAD
1781*c217d954SCole Faust#define vload_partial_1_9 NO_LOAD
1782*c217d954SCole Faust#define vload_partial_1_10 NO_LOAD
1783*c217d954SCole Faust#define vload_partial_1_11 NO_LOAD
1784*c217d954SCole Faust#define vload_partial_1_12 NO_LOAD
1785*c217d954SCole Faust#define vload_partial_1_13 NO_LOAD
1786*c217d954SCole Faust#define vload_partial_1_14 NO_LOAD
1787*c217d954SCole Faust#define vload_partial_1_15 NO_LOAD
1788*c217d954SCole Faust#define vload_partial_1_16 NO_LOAD
1789*c217d954SCole Faust
1790*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
1791*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
1792*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
1793*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
1794*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
1795*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
1796*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
1797*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
1798*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
1799*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
1800*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
1801*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
1802*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
1803*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
1804*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
1805*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
1806*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
1807*c217d954SCole Faust
1808*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
1809*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
1810*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
1811*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
1812*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
1813*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
1814*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
1815*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
1816*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
1817*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
1818*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
1819*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
1820*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
1821*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
1822*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
1823*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
1824*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
1825*c217d954SCole Faust
/* Dispatch table for a 4-element vector: vload_partial_4_<n> maps to
 * vload_partial_<n> for n = 1..4 and to NO_LOAD for n = 0 and n > 4. */
1826*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
1827*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
1828*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
1829*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
1830*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
1831*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
1832*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
1833*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
1834*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
1835*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
1836*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
1837*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
1838*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
1839*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
1840*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
1841*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
1842*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
1843*c217d954SCole Faust
/* Dispatch table for an 8-element vector: vload_partial_8_<n> maps to
 * vload_partial_<n> for n = 1..8 and to NO_LOAD for n = 0 and n > 8. */
1844*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
1845*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
1846*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
1847*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
1848*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
1849*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
1850*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
1851*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
1852*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
1853*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
1854*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
1855*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
1856*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
1857*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
1858*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
1859*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
1860*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
1861*c217d954SCole Faust
/* Dispatch table for a 16-element vector: vload_partial_16_<n> maps to
 * vload_partial_<n> for every n = 1..16; only n = 0 maps to NO_LOAD. */
1862*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
1863*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
1864*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
1865*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
1866*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
1867*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
1868*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
1869*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
1870*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
1871*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
1872*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
1873*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
1874*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
1875*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
1876*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
1877*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
1878*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
1879*c217d954SCole Faust
1880*c217d954SCole Faust
/* vload_partial_<n>(DATA, OFFSET, PTR): fill the first n components of the
 * vector DATA from memory at PTR, leaving the remaining components untouched.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 issue a single vload<n>. The other sizes are
 * composed from two pieces: a 4- or 8-wide head followed by a tail read from
 * PTR + 4 or PTR + 8 into the next components of DATA.
 *
 * Every expansion is one or more complete statements that already end in a
 * semicolon and is not wrapped in a block, so it must not be used as the
 * unbraced body of an if/else/loop.
 *
 * NOTE(review): the composed variants forward OFFSET unchanged to both the
 * head and the tail load. Because vload<n> addresses PTR + OFFSET * n, head
 * and tail are only contiguous when OFFSET is 0 - presumably the only value
 * callers pass; confirm. */
1881*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
1882*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
1883*c217d954SCole Faust
1884*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
1885*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
1886*c217d954SCole Faust
1887*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
1888*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
1889*c217d954SCole Faust
1890*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
1891*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
1892*c217d954SCole Faust
/* 5..7 elements: 4-wide head, then 1..3 elements starting at PTR + 4. */
1893*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
1894*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1895*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
1896*c217d954SCole Faust
1897*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
1898*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1899*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1900*c217d954SCole Faust
1901*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
1902*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1903*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1904*c217d954SCole Faust
1905*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
1906*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
1907*c217d954SCole Faust
/* 9..15 elements: 8-wide head, then 1..7 elements starting at PTR + 8. */
1908*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
1909*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1910*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
1911*c217d954SCole Faust
1912*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
1913*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1914*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1915*c217d954SCole Faust
1916*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
1917*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1918*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1919*c217d954SCole Faust
1920*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
1921*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1922*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1923*c217d954SCole Faust
/* 13..15 pass the whole upper half (.s89ABCDEF) to the 5/6/7-element loader,
 * which then selects the components it needs from that 8-wide view. */
1924*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
1925*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1926*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1927*c217d954SCole Faust
1928*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
1929*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1930*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1931*c217d954SCole Faust
1932*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
1933*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1934*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1935*c217d954SCole Faust
1936*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
1937*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
1938*c217d954SCole Faust
1939*c217d954SCole Faust
1940*c217d954SCole Faust
/* PIXEL_UNIT<vec_size>: vec_size divided by 4. The values match the x1/x2/x4
 * suffixes of the read_image2d_* / write_image2d_* macros, which move 4, 8 and
 * 16 elements respectively. Only vector sizes 4, 8 and 16 are defined. */
1941*c217d954SCole Faust#define PIXEL_UNIT4 1
1942*c217d954SCole Faust#define PIXEL_UNIT8 2
1943*c217d954SCole Faust#define PIXEL_UNIT16 4
1944*c217d954SCole Faust
1945*c217d954SCole Faust
/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): expands vec_size first (it may
 * itself be a macro) and then pastes it onto PIXEL_UNIT via the _STR helper. */
1946*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1947*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1948*c217d954SCole Faust
1949*c217d954SCole Faust
/* read_image2d_floatx<k>(img, x_coord, y_coord): read k horizontally adjacent
 * pixels with read_imagef, starting at (x_coord, y_coord) and stepping x by 1,
 * and pack them into one float4 / float8 / float16 value.
 * The coordinate arguments are substituted without parentheses and the
 * expansion already ends in a semicolon. */
1950*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
1951*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
1952*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
1953*c217d954SCole Faust
/* Half-precision counterparts built on read_imageh; only available when both
 * ARM_COMPUTE_OPENCL_FP16_ENABLED and cl_khr_fp16 are defined. */
1954*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1955*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
1956*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
1957*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
1958*c217d954SCole Faust#endif
1958*c217d954SCole Faust#endif
1959*c217d954SCole Faust
/* write_image2d_floatx<k>(img, x_coord, y_coord, values): write k horizontally
 * adjacent pixels with write_imagef, starting at (x_coord, y_coord). For k > 1
 * `values` is split into consecutive 4-component groups (.s0123, .s4567, ...)
 * and the writes are chained with the comma operator.
 * `values` is evaluated once per pixel and the expansion already ends in a
 * semicolon. */
1960*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
1961*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
1962*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1963*c217d954SCole Faust
/* Half-precision counterparts built on write_imageh; only available when both
 * ARM_COMPUTE_OPENCL_FP16_ENABLED and cl_khr_fp16 are defined. */
1964*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1965*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
1966*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
1967*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1968*c217d954SCole Faust#endif
1968*c217d954SCole Faust#endif
1969*c217d954SCole Faust
1970*c217d954SCole Faust
/* READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): select
 * read_image2d_<data_type>x<n0>. The outer macro expands data_type and n0
 * before the _STR helper pastes them into the callee name. */
1971*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
1972*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
1973*c217d954SCole Faust
1974*c217d954SCole Faust
/* WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): select
 * write_image2d_<data_type>x<n0> using the same two-level expansion. */
1975*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
1976*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1977*c217d954SCole Faust
/* VSTORE(size): name of the store routine for `size` elements, i.e.
 * vstore<size>. The indirection through VSTORE_STR lets `size` be a macro
 * that is expanded before pasting. */
1978*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
1979*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
1980*c217d954SCole Faust
/* "Vector of one" aliases: <type>1 names the plain scalar type, so a pasted
 * name such as VEC_DATA_TYPE(type, 1) resolves to a valid type. */
1981*c217d954SCole Faust#define float1 float
1982*c217d954SCole Faust#define half1 half
1983*c217d954SCole Faust#define char1 char
1984*c217d954SCole Faust#define uchar1 uchar
1985*c217d954SCole Faust#define short1 short
1986*c217d954SCole Faust#define ushort1 ushort
1987*c217d954SCole Faust#define int1 int
1988*c217d954SCole Faust#define uint1 uint
1989*c217d954SCole Faust#define long1 long
1990*c217d954SCole Faust#define ulong1 ulong
1991*c217d954SCole Faust#define double1 double
1992*c217d954SCole Faust
/* Scalar stand-ins for the vload<n> / vstore<n> family, which has no
 * one-element form.
 *
 * vload1(OFFSET, PTR)        yields the element at PTR + OFFSET.
 * vstore1(DATA, OFFSET, PTR) assigns DATA to the element at PTR + OFFSET.
 *
 * Every argument and the whole expansion are parenthesized so that compound
 * arguments (for example `i & 1` as OFFSET) and surrounding operators bind as
 * the call syntax suggests. Neither expansion carries a trailing semicolon. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
1995*c217d954SCole Faust
1996*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size): name of the routine that stores the first
 * store_size elements of a size-element vector, i.e.
 * vstore_partial_<size>_<store_size>. Both arguments are macro-expanded
 * before pasting thanks to the _STR indirection. */
1997*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
1998*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
1999*c217d954SCole Faust
/* NO_STORE(data, offs, ptr): accepts the same arguments as a real store but
 * expands to an empty block, so nothing is written and no argument is
 * evaluated. Used for table entries that have no valid store. */
2000*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
2001*c217d954SCole Faust    {                             \
2002*c217d954SCole Faust    }
2003*c217d954SCole Faust
2004*c217d954SCole Faust
/* Dispatch table for a scalar (size 1): only store_size 1 performs a store.
 * It maps straight to vstore1 rather than vstore_partial_1, so DATA is written
 * as-is without a .s0 component access. All other sizes map to NO_STORE. */
2005*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
2006*c217d954SCole Faust#define vstore_partial_1_1 vstore1
2007*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
2008*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
2009*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
2010*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
2011*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
2012*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
2013*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
2014*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
2015*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
2016*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
2017*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
2018*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
2019*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
2020*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
2021*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
2022*c217d954SCole Faust
/* Dispatch table for a 2-element vector: vstore_partial_2_<n> maps to
 * vstore_partial_<n> for n = 1..2 and to NO_STORE for n = 0 and n > 2. */
2023*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
2024*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
2025*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
2026*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
2027*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
2028*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
2029*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
2030*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
2031*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
2032*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
2033*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
2034*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
2035*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
2036*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
2037*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
2038*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
2039*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
2040*c217d954SCole Faust
/* Dispatch table for a 3-element vector: vstore_partial_3_<n> maps to
 * vstore_partial_<n> for n = 1..3 and to NO_STORE for n = 0 and n > 3. */
2041*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
2042*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
2043*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
2044*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
2045*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
2046*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
2047*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
2048*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
2049*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
2050*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
2051*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
2052*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
2053*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
2054*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
2055*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
2056*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
2057*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
2058*c217d954SCole Faust
/* Dispatch table for a 4-element vector: vstore_partial_4_<n> maps to
 * vstore_partial_<n> for n = 1..4 and to NO_STORE for n = 0 and n > 4. */
2059*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
2060*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
2061*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
2062*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
2063*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
2064*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
2065*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
2066*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
2067*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
2068*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
2069*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
2070*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
2071*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
2072*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
2073*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
2074*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
2075*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
2076*c217d954SCole Faust
/* Dispatch table for an 8-element vector: vstore_partial_8_<n> maps to
 * vstore_partial_<n> for n = 1..8 and to NO_STORE for n = 0 and n > 8. */
2077*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
2078*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
2079*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
2080*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
2081*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
2082*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
2083*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
2084*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
2085*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
2086*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
2087*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
2088*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
2089*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
2090*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
2091*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
2092*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
2093*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
2094*c217d954SCole Faust
/* Dispatch table for a 16-element vector: vstore_partial_16_<n> maps to
 * vstore_partial_<n> for every n = 1..16; only n = 0 maps to NO_STORE. */
2095*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
2096*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
2097*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
2098*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
2099*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
2100*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
2101*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
2102*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
2103*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
2104*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
2105*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
2106*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
2107*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
2108*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
2109*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
2110*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
2111*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
2112*c217d954SCole Faust
2113*c217d954SCole Faust
/* vstore_partial_<n>(DATA, OFFSET, PTR): write the first n components of the
 * vector DATA to memory at PTR.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 issue a single vstore<n>. The other sizes are
 * composed from two pieces: a 4- or 8-wide head followed by a tail written to
 * PTR + 4 or PTR + 8 from the next components of DATA.
 *
 * Every expansion is one or more complete statements that already end in a
 * semicolon and is not wrapped in a block, so it must not be used as the
 * unbraced body of an if/else/loop.
 *
 * NOTE(review): the composed variants forward OFFSET unchanged to both the
 * head and the tail store. Because vstore<n> addresses PTR + OFFSET * n, head
 * and tail are only contiguous when OFFSET is 0 - presumably the only value
 * callers pass; confirm. */
2114*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
2115*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
2116*c217d954SCole Faust
2117*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
2118*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
2119*c217d954SCole Faust
2120*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
2121*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
2122*c217d954SCole Faust
2123*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
2124*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
2125*c217d954SCole Faust
/* 5..7 elements: 4-wide head, then 1..3 elements starting at PTR + 4. */
2126*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
2127*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2128*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
2129*c217d954SCole Faust
2130*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
2131*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2132*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2133*c217d954SCole Faust
2134*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
2135*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2136*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2137*c217d954SCole Faust
2138*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
2139*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
2140*c217d954SCole Faust
/* 9..15 elements: 8-wide head, then 1..7 elements starting at PTR + 8.
 * The upper components are spelled with lowercase hex letters here (.s89ab),
 * which OpenCL C accepts interchangeably with the uppercase form. */
2141*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
2142*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2143*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
2144*c217d954SCole Faust
2145*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
2146*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2147*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2148*c217d954SCole Faust
2149*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
2150*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2151*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2152*c217d954SCole Faust
2153*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
2154*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2155*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2156*c217d954SCole Faust
/* 13..15 pass the whole upper half (.s89abcdef) to the 5/6/7-element store,
 * which then selects the components it needs from that 8-wide view. */
2157*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
2158*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2159*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2160*c217d954SCole Faust
2161*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
2162*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2163*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2164*c217d954SCole Faust
2165*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
2166*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2167*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2168*c217d954SCole Faust
2169*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
2170*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
2171*c217d954SCole Faust
2172*c217d954SCole Faust
2173*c217d954SCole Faust
2174*c217d954SCole Faust
2175*c217d954SCole Faust
/* Saturating-conversion aliases for floating-point destinations.
 *
 * CONVERT_SAT pastes "_sat" onto convert_<type>, but these aliases route every
 * float/half "_sat" name to the plain conversion of the same type and width,
 * so the generic macro can be used with any destination type.
 *
 * convert_half_sat previously aliased convert_float, which made the scalar
 * half case convert to the wrong type and disagreed with convert_half1_sat
 * and the rest of the half family; it now maps to convert_half.
 * NOTE(review): convert_half needs half-precision support in the compiler;
 * kernels that request a half destination presumably already enable it -
 * confirm. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
2190*c217d954SCole Faust
/* "Vector of one" conversion aliases: convert_<type>1 names the scalar
 * convert_<type>, so a pasted name such as CONVERT(x, VEC_DATA_TYPE(type, 1))
 * resolves to a valid conversion. */
2191*c217d954SCole Faust#define convert_float1 convert_float
2192*c217d954SCole Faust#define convert_half1 convert_half
2193*c217d954SCole Faust#define convert_char1 convert_char
2194*c217d954SCole Faust#define convert_uchar1 convert_uchar
2195*c217d954SCole Faust#define convert_short1 convert_short
2196*c217d954SCole Faust#define convert_ushort1 convert_ushort
2197*c217d954SCole Faust#define convert_int1 convert_int
2198*c217d954SCole Faust#define convert_uint1 convert_uint
2199*c217d954SCole Faust#define convert_long1 convert_long
2200*c217d954SCole Faust#define convert_ulong1 convert_ulong
2201*c217d954SCole Faust#define convert_double1 convert_double
2202*c217d954SCole Faust
/* "Vector of one" saturating aliases: convert_<type>1_sat names the scalar
 * convert_<type>_sat.
 *
 * The convert_uchar2_sat .. convert_uchar16_sat entries are defined as
 * themselves. A macro is not re-expanded inside its own replacement, so these
 * lines leave the name unchanged and are effectively no-ops.
 *
 * NOTE(review): convert_double1_sat forwards to convert_double_sat, which is
 * not defined in this excerpt; verify it resolves before relying on it. */
2203*c217d954SCole Faust#define convert_char1_sat convert_char_sat
2204*c217d954SCole Faust#define convert_uchar1_sat convert_uchar_sat
2205*c217d954SCole Faust#define convert_uchar2_sat convert_uchar2_sat
2206*c217d954SCole Faust#define convert_uchar3_sat convert_uchar3_sat
2207*c217d954SCole Faust#define convert_uchar4_sat convert_uchar4_sat
2208*c217d954SCole Faust#define convert_uchar8_sat convert_uchar8_sat
2209*c217d954SCole Faust#define convert_uchar16_sat convert_uchar16_sat
2210*c217d954SCole Faust#define convert_short1_sat convert_short_sat
2211*c217d954SCole Faust#define convert_ushort1_sat convert_ushort_sat
2212*c217d954SCole Faust#define convert_int1_sat convert_int_sat
2213*c217d954SCole Faust#define convert_uint1_sat convert_uint_sat
2214*c217d954SCole Faust#define convert_long1_sat convert_long_sat
2215*c217d954SCole Faust#define convert_ulong1_sat convert_ulong_sat
2216*c217d954SCole Faust#define convert_double1_sat convert_double_sat
2217*c217d954SCole Faust
/* VEC_DATA_TYPE(type, size): the vector type name <type><size>, e.g. float4.
 * Both arguments are macro-expanded before pasting. */
2218*c217d954SCole Faust#define VEC_DATA_TYPE_STR(type, size) type##size
2219*c217d954SCole Faust#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2220*c217d954SCole Faust
/* CONVERT(x, type): convert_<type>(x). */
2221*c217d954SCole Faust#define CONVERT_STR(x, type) (convert_##type((x)))
2222*c217d954SCole Faust#define CONVERT(x, type) CONVERT_STR(x, type)
2223*c217d954SCole Faust
/* CONVERT_SAT(x, type): convert_<type>_sat(x). For float and half types the
 * _sat names are aliased to plain conversions elsewhere in this file. */
2224*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2225*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2226*c217d954SCole Faust
/* CONVERT_SAT_ROUND(x, type, round): convert_<type>_sat_<round>(x), where
 * round is pasted verbatim as the rounding-mode suffix. */
2227*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2228*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
2229*c217d954SCole Faust
/* select_vec_dt_<type>(size): integer vector type paired with <type>.
 * Integer types map to themselves (signedness preserved); half maps to short
 * and float maps to int, i.e. an integer of the same element width.
 * Presumably this is the mask/condition type for select-style operations on
 * <type> vectors - confirm against callers. No entry exists for double. */
2230*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size
2231*c217d954SCole Faust#define select_vec_dt_char(size) char##size
2232*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size
2233*c217d954SCole Faust#define select_vec_dt_short(size) short##size
2234*c217d954SCole Faust#define select_vec_dt_half(size) short##size
2235*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size
2236*c217d954SCole Faust#define select_vec_dt_int(size) int##size
2237*c217d954SCole Faust#define select_vec_dt_float(size) int##size
2238*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size
2239*c217d954SCole Faust#define select_vec_dt_long(size) long##size
2240*c217d954SCole Faust
/* SELECT_VEC_DATA_TYPE(type, size): dispatch to select_vec_dt_<type>(size)
 * after expanding both arguments. SELECT_DATA_TYPE(type) is the size-1 form;
 * the resulting <int type>1 name resolves through the scalar aliases. */
2241*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2242*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2243*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2244*c217d954SCole Faust
/* signed_int_vec_dt_<type>(size): signed integer vector type with the same
 * element width as <type>. Unsigned types map to their signed counterpart,
 * half maps to short and float maps to int. No entry exists for double. */
2245*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size
2246*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size
2247*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size
2248*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size
2249*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size
2250*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size
2251*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size
2252*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size
2253*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size
2254*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size
2255*c217d954SCole Faust
/* SIGNED_INT_VEC_DATA_TYPE(type, size): dispatch to
 * signed_int_vec_dt_<type>(size) after expanding both arguments.
 * SIGNED_INT_DATA_TYPE(type) is the size-1 form. */
2256*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
2257*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
2258*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259*c217d954SCole Faust
/* sum_reduce_<n>(x): horizontal sum of the n components of vector x
 * (n = 1 returns x itself).
 *
 * Each expansion is fully parenthesized so it behaves as a single operand:
 * previously `2 * SUM_REDUCE(v, 2)` expanded to `2 * v.s0 + v.s1`.
 * The components are added strictly left to right (s0 + s1 + ... ), which is
 * the order the earlier nested, unparenthesized definitions produced, so
 * floating-point results are unchanged for existing call sites.
 * x is evaluated once per component; avoid arguments with side effects. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (((x).s0) + ((x).s1) + ((x).s2))
#define sum_reduce_4(x) (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3))
#define sum_reduce_8(x) \
    (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7))
#define sum_reduce_16(x)                                                                      \
    (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7) + \
     ((x).s8) + ((x).s9) + ((x).sA) + ((x).sB) + ((x).sC) + ((x).sD) + ((x).sE) + ((x).sF))

/* SUM_REDUCE(x, size): dispatch to sum_reduce_<size>(x); size is
 * macro-expanded before pasting. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2269*c217d954SCole Faust
/* prod_reduce_<n>(x): horizontal product of the n components of vector x
 * (n = 1 returns x itself).
 *
 * Each expansion is fully parenthesized so it behaves as a single operand:
 * previously `a / PROD_REDUCE(v, 2)` expanded to `a / v.s0 * v.s1`.
 * The components are multiplied strictly left to right (s0 * s1 * ... ),
 * which is the order the earlier nested, unparenthesized definitions
 * produced, so floating-point results are unchanged for existing call sites.
 * x is evaluated once per component; avoid arguments with side effects. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (((x).s0) * ((x).s1) * ((x).s2))
#define prod_reduce_4(x) (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3))
#define prod_reduce_8(x) \
    (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7))
#define prod_reduce_16(x)                                                                     \
    (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7) * \
     ((x).s8) * ((x).s9) * ((x).sA) * ((x).sB) * ((x).sC) * ((x).sD) * ((x).sE) * ((x).sF))

/* PROD_REDUCE(x, size): dispatch to prod_reduce_<size>(x); size is
 * macro-expanded before pasting. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2279*c217d954SCole Faust
/* max_reduce_<n>(x): largest of the n components of vector x (n = 1 returns
 * x itself). Built as a tree of pairwise max() calls over the low and high
 * halves, so x is evaluated once per component. */
2280*c217d954SCole Faust#define max_reduce_1(x) (x)
2281*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1))
2282*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2283*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2284*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2285*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2286*c217d954SCole Faust
/* MAX_REDUCE(x, size): dispatch to max_reduce_<size>(x); size is
 * macro-expanded before pasting. */
2287*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2288*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2289*c217d954SCole Faust
/* VECTOR_DECLARATION(name): expands to the comma-separated parameter list
 * describing a 1D buffer: a __global uchar pointer <name>_ptr, the x stride
 * and step, and the byte offset of the first element. No trailing comma is
 * emitted. */
2290*c217d954SCole Faust#define VECTOR_DECLARATION(name)     \
2291*c217d954SCole Faust    __global uchar *name##_ptr,      \
2292*c217d954SCole Faust    uint        name##_stride_x, \
2293*c217d954SCole Faust    uint        name##_step_x,   \
2294*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2295*c217d954SCole Faust
/* IMAGE_DECLARATION(name): parameter list for a 2D buffer: <name>_ptr,
 * stride/step pairs for x and y, and the byte offset of the first element. */
2296*c217d954SCole Faust#define IMAGE_DECLARATION(name)      \
2297*c217d954SCole Faust    __global uchar *name##_ptr,      \
2298*c217d954SCole Faust    uint        name##_stride_x, \
2299*c217d954SCole Faust    uint        name##_step_x,   \
2300*c217d954SCole Faust    uint        name##_stride_y, \
2301*c217d954SCole Faust    uint        name##_step_y,   \
2302*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2303*c217d954SCole Faust
/* TENSOR3D_DECLARATION(name): parameter list for a 3D buffer: <name>_ptr,
 * stride/step pairs for x, y and z, and the byte offset of the first
 * element. */
2304*c217d954SCole Faust#define TENSOR3D_DECLARATION(name)   \
2305*c217d954SCole Faust    __global uchar *name##_ptr,      \
2306*c217d954SCole Faust    uint        name##_stride_x, \
2307*c217d954SCole Faust    uint        name##_step_x,   \
2308*c217d954SCole Faust    uint        name##_stride_y, \
2309*c217d954SCole Faust    uint        name##_step_y,   \
2310*c217d954SCole Faust    uint        name##_stride_z, \
2311*c217d954SCole Faust    uint        name##_step_z,   \
2312*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2313*c217d954SCole Faust
/* TENSOR4D_DECLARATION(name): parameter list for a 4D buffer: <name>_ptr,
 * stride/step pairs for x, y, z and w, and the byte offset of the first
 * element. */
2314*c217d954SCole Faust#define TENSOR4D_DECLARATION(name)   \
2315*c217d954SCole Faust    __global uchar *name##_ptr,      \
2316*c217d954SCole Faust    uint        name##_stride_x, \
2317*c217d954SCole Faust    uint        name##_step_x,   \
2318*c217d954SCole Faust    uint        name##_stride_y, \
2319*c217d954SCole Faust    uint        name##_step_y,   \
2320*c217d954SCole Faust    uint        name##_stride_z, \
2321*c217d954SCole Faust    uint        name##_step_z,   \
2322*c217d954SCole Faust    uint        name##_stride_w, \
2323*c217d954SCole Faust    uint        name##_step_w,   \
2324*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2325*c217d954SCole Faust
/* TENSOR5D_DECLARATION(name): parameter list for a 5D buffer: <name>_ptr,
 * stride/step pairs for x, y, z, w and v, and the byte offset of the first
 * element. */
2326*c217d954SCole Faust#define TENSOR5D_DECLARATION(name)   \
2327*c217d954SCole Faust    __global uchar *name##_ptr,      \
2328*c217d954SCole Faust    uint        name##_stride_x, \
2329*c217d954SCole Faust    uint        name##_step_x,   \
2330*c217d954SCole Faust    uint        name##_stride_y, \
2331*c217d954SCole Faust    uint        name##_step_y,   \
2332*c217d954SCole Faust    uint        name##_stride_z, \
2333*c217d954SCole Faust    uint        name##_step_z,   \
2334*c217d954SCole Faust    uint        name##_stride_w, \
2335*c217d954SCole Faust    uint        name##_step_w,   \
2336*c217d954SCole Faust    uint        name##_stride_v, \
2337*c217d954SCole Faust    uint        name##_step_v,   \
2338*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2339*c217d954SCole Faust
/* CONVERT_TO_VECTOR_STRUCT(name): call update_vector_workitem_ptr (defined
 * outside this excerpt) with the parameters that VECTOR_DECLARATION(name)
 * declares: pointer, first-element offset, x stride and x step. */
2340*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \
2341*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2342*c217d954SCole Faust
/* Same call, but with 0 passed in place of the x step. */
2343*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2344*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2345*c217d954SCole Faust
2346*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \
2347*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2348*c217d954SCole Faust
2349*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2350*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2351*c217d954SCole Faust
/** Builds an Image from the parameters of a 3D tensor called @p name.
 *
 * The pointer is advanced by the first-element offset plus
 * get_global_id(0) * name_step_x + get_global_id(1) * name_step_y + get_global_id(2) * name_step_z
 * (see update_image_from_tensor3D_workitem_ptr).
 *
 * This macro used to be defined twice with identical text; the redundant second definition
 * has been dropped.
 */
2352*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2353*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2354*c217d954SCole Faust
/** Variant of CONVERT_TENSOR3D_TO_IMAGE_STRUCT that passes 0 for the x and y steps.
 *
 * The z step is still forwarded, so the pointer is advanced by the first-element offset plus
 * get_global_id(2) * name_step_z.
 */
2355*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
2356*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2360*c217d954SCole Faust
/** Builds a Tensor3D from the name_* parameters; the pointer is advanced by the first-element
 *  offset plus get_global_id(d) * name_step_d for d in x (0), y (1), z (2)
 *  (see update_tensor3D_workitem_ptr).
 */
2361*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
2362*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2363*c217d954SCole Faust                                 name##_stride_z, name##_step_z)
2364*c217d954SCole Faust
/** Same as CONVERT_TO_TENSOR3D_STRUCT but passes 0 for all three steps, so the pointer is
 *  advanced by the first-element offset only.
 */
2365*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
2366*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
2367*c217d954SCole Faust
/** Builds a Tensor4D from the name_* parameters (see update_tensor4D_workitem_ptr).
 *
 * @param name     Prefix of the tensor parameters.
 * @param mod_size Divisor used to split get_global_id(2) into a z index (remainder) and a
 *                 w index (quotient).
 */
2368*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
2369*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2370*c217d954SCole Faust                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
2371*c217d954SCole Faust
/** Same as CONVERT_TO_TENSOR4D_STRUCT but passes 0 for all four steps, so the pointer is
 *  advanced by the first-element offset only. mod_size is still forwarded and still used as
 *  a divisor by the callee.
 */
2372*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
2373*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
2374*c217d954SCole Faust
/** Builds a Tensor3D from the name_* parameters without moving the pointer at all: neither the
 *  first-element offset nor any step is applied (see tensor3D_ptr_no_update).
 */
2375*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
2376*c217d954SCole Faust    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2377*c217d954SCole Faust                           name##_stride_z, name##_step_z)
2378*c217d954SCole Faust
2379*c217d954SCole Faust
/** Descriptor of a 1D buffer in global memory. */
2380*c217d954SCole Fausttypedef struct Vector
2381*c217d954SCole Faust{
2382*c217d954SCole Faust    __global uchar *ptr; /**< Byte pointer into the data */
2383*c217d954SCole Faust    int             offset_first_element_in_bytes; /**< Byte offset of the first element */
2384*c217d954SCole Faust    int             stride_x; /**< Byte distance between consecutive x indices (see vector_offset) */
2385*c217d954SCole Faust} Vector;
2386*c217d954SCole Faust
2387*c217d954SCole Faust
/** Descriptor of a 2D buffer in global memory. */
2388*c217d954SCole Fausttypedef struct Image
2389*c217d954SCole Faust{
2390*c217d954SCole Faust    __global uchar *ptr; /**< Byte pointer into the data */
2391*c217d954SCole Faust    int             offset_first_element_in_bytes; /**< Byte offset of the first element */
2392*c217d954SCole Faust    int             stride_x; /**< Byte distance between consecutive x indices (see offset) */
2393*c217d954SCole Faust    int             stride_y; /**< Byte distance between consecutive y indices (see offset) */
2394*c217d954SCole Faust} Image;
2395*c217d954SCole Faust
2396*c217d954SCole Faust
/** Descriptor of a 3D buffer in global memory. */
2397*c217d954SCole Fausttypedef struct Tensor3D
2398*c217d954SCole Faust{
2399*c217d954SCole Faust    __global uchar *ptr; /**< Byte pointer into the data */
2400*c217d954SCole Faust    int             offset_first_element_in_bytes; /**< Byte offset of the first element */
2401*c217d954SCole Faust    int             stride_x; /**< Byte distance between consecutive x indices */
2402*c217d954SCole Faust    int             stride_y; /**< Byte distance between consecutive y indices */
2403*c217d954SCole Faust    int             stride_z; /**< Byte distance between consecutive z indices */
2404*c217d954SCole Faust} Tensor3D;
2405*c217d954SCole Faust
2406*c217d954SCole Faust
/** Descriptor of a 4D buffer in global memory. */
2407*c217d954SCole Fausttypedef struct Tensor4D
2408*c217d954SCole Faust{
2409*c217d954SCole Faust    __global uchar *ptr; /**< Byte pointer into the data */
2410*c217d954SCole Faust    int             offset_first_element_in_bytes; /**< Byte offset of the first element */
2411*c217d954SCole Faust    int             stride_x; /**< Byte distance between consecutive x indices */
2412*c217d954SCole Faust    int             stride_y; /**< Byte distance between consecutive y indices */
2413*c217d954SCole Faust    int             stride_z; /**< Byte distance between consecutive z indices */
2414*c217d954SCole Faust    int             stride_w; /**< Byte distance between consecutive w indices */
2415*c217d954SCole Faust} Tensor4D;
2416*c217d954SCole Faust
2417*c217d954SCole Faust
/** Wraps the given parameters in a Vector and moves its pointer to the calling work-item's data.
 *
 * @param[in] ptr                           Base byte pointer of the buffer
 * @param[in] offset_first_element_in_bytes Byte offset of the first element
 * @param[in] stride_x                      Stored unchanged in the returned structure
 * @param[in] step_x                        Byte amount multiplied by get_global_id(0)
 *
 * @return A Vector whose ptr equals ptr + offset_first_element_in_bytes + get_global_id(0) * step_x
 */
2418*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2419*c217d954SCole Faust{
2420*c217d954SCole Faust    Vector vector =
2421*c217d954SCole Faust    {
2422*c217d954SCole Faust        .ptr                           = ptr,
2423*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2424*c217d954SCole Faust        .stride_x                      = stride_x,
2425*c217d954SCole Faust    };
2426*c217d954SCole Faust    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2427*c217d954SCole Faust    return vector;
2428*c217d954SCole Faust}
2429*c217d954SCole Faust
2430*c217d954SCole Faust
/** Wraps the given parameters in an Image and moves its pointer to the calling work-item's data.
 *
 * @param[in] ptr                           Base byte pointer of the buffer
 * @param[in] offset_first_element_in_bytes Byte offset of the first element
 * @param[in] stride_x                      Stored unchanged in the returned structure
 * @param[in] step_x                        Byte amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Stored unchanged in the returned structure
 * @param[in] step_y                        Byte amount multiplied by get_global_id(1)
 *
 * @return An Image whose ptr has been advanced by the offset and both scaled global ids
 */
2431*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2432*c217d954SCole Faust{
2433*c217d954SCole Faust    Image img =
2434*c217d954SCole Faust    {
2435*c217d954SCole Faust        .ptr                           = ptr,
2436*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2437*c217d954SCole Faust        .stride_x                      = stride_x,
2438*c217d954SCole Faust        .stride_y                      = stride_y
2439*c217d954SCole Faust    };
2440*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2441*c217d954SCole Faust    return img;
2442*c217d954SCole Faust}
2443*c217d954SCole Faust
2444*c217d954SCole Faust
/** Wraps the parameters of a 3D tensor in an Image and moves its pointer to the calling
 *  work-item's data, including the z contribution.
 *
 * @param[in] ptr                           Base byte pointer of the buffer
 * @param[in] offset_first_element_in_bytes Byte offset of the first element
 * @param[in] stride_x                      Stored unchanged in the returned structure
 * @param[in] step_x                        Byte amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Stored unchanged in the returned structure
 * @param[in] step_y                        Byte amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Not used: Image has no z stride field
 * @param[in] step_z                        Byte amount multiplied by get_global_id(2)
 *
 * @return An Image whose ptr has been advanced by the offset and the three scaled global ids
 */
2445*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2446*c217d954SCole Faust{
2447*c217d954SCole Faust    Image img =
2448*c217d954SCole Faust    {
2449*c217d954SCole Faust        .ptr                           = ptr,
2450*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2451*c217d954SCole Faust        .stride_x                      = stride_x,
2452*c217d954SCole Faust        .stride_y                      = stride_y
2453*c217d954SCole Faust    };
2454*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2455*c217d954SCole Faust    return img;
2456*c217d954SCole Faust}
2457*c217d954SCole Faust
2458*c217d954SCole Faust
/** Wraps the given parameters in a Tensor3D and moves its pointer to the calling work-item's data.
 *
 * @param[in] ptr                           Base byte pointer of the buffer
 * @param[in] offset_first_element_in_bytes Byte offset of the first element
 * @param[in] stride_x                      Stored unchanged in the returned structure
 * @param[in] step_x                        Byte amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Stored unchanged in the returned structure
 * @param[in] step_y                        Byte amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Stored unchanged in the returned structure
 * @param[in] step_z                        Byte amount multiplied by get_global_id(2)
 *
 * @return A Tensor3D whose ptr has been advanced by the offset and the three scaled global ids
 */
2459*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2460*c217d954SCole Faust{
2461*c217d954SCole Faust    Tensor3D tensor =
2462*c217d954SCole Faust    {
2463*c217d954SCole Faust        .ptr                           = ptr,
2464*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2465*c217d954SCole Faust        .stride_x                      = stride_x,
2466*c217d954SCole Faust        .stride_y                      = stride_y,
2467*c217d954SCole Faust        .stride_z                      = stride_z
2468*c217d954SCole Faust    };
2469*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2470*c217d954SCole Faust    return tensor;
2471*c217d954SCole Faust}
2472*c217d954SCole Faust
2473*c217d954SCole Faust
/** Wraps the given parameters in a Tensor3D without moving the pointer.
 *
 * Unlike update_tensor3D_workitem_ptr, the returned ptr is exactly @p ptr: the first-element
 * offset is only recorded in the structure, and step_x, step_y and step_z are not used.
 *
 * @param[in] ptr                           Base byte pointer of the buffer
 * @param[in] offset_first_element_in_bytes Byte offset of the first element (stored, not applied)
 * @param[in] stride_x                      Stored unchanged in the returned structure
 * @param[in] step_x                        Unused
 * @param[in] stride_y                      Stored unchanged in the returned structure
 * @param[in] step_y                        Unused
 * @param[in] stride_z                      Stored unchanged in the returned structure
 * @param[in] step_z                        Unused
 *
 * @return A Tensor3D describing the buffer from its base pointer
 */
2474*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2475*c217d954SCole Faust{
2476*c217d954SCole Faust    Tensor3D tensor =
2477*c217d954SCole Faust    {
2478*c217d954SCole Faust        .ptr                           = ptr,
2479*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2480*c217d954SCole Faust        .stride_x                      = stride_x,
2481*c217d954SCole Faust        .stride_y                      = stride_y,
2482*c217d954SCole Faust        .stride_z                      = stride_z
2483*c217d954SCole Faust    };
2484*c217d954SCole Faust    return tensor;
2485*c217d954SCole Faust}
2486*c217d954SCole Faust
/** Wraps the given parameters in a Tensor4D and moves its pointer to the calling work-item's data.
 *
 * The third global id carries two indices: get_global_id(2) % mod_size is scaled by step_z and
 * get_global_id(2) / mod_size is scaled by step_w.
 *
 * @param[in] ptr                           Base byte pointer of the buffer
 * @param[in] offset_first_element_in_bytes Byte offset of the first element
 * @param[in] stride_x                      Stored unchanged in the returned structure
 * @param[in] step_x                        Byte amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Stored unchanged in the returned structure
 * @param[in] step_y                        Byte amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Stored unchanged in the returned structure
 * @param[in] step_z                        Byte amount multiplied by get_global_id(2) % mod_size
 * @param[in] stride_w                      Stored unchanged in the returned structure
 * @param[in] step_w                        Byte amount multiplied by get_global_id(2) / mod_size
 * @param[in] mod_size                      Divisor applied to get_global_id(2); it is used with
 *                                          both % and /, so it must not be 0
 *
 * @return A Tensor4D whose ptr has been advanced by the offset and the four scaled indices
 */
2487*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2488*c217d954SCole Faust                                             uint step_w,
2489*c217d954SCole Faust                                             uint mod_size)
2490*c217d954SCole Faust{
2491*c217d954SCole Faust    Tensor4D tensor =
2492*c217d954SCole Faust    {
2493*c217d954SCole Faust        .ptr                           = ptr,
2494*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2495*c217d954SCole Faust        .stride_x                      = stride_x,
2496*c217d954SCole Faust        .stride_y                      = stride_y,
2497*c217d954SCole Faust        .stride_z                      = stride_z,
2498*c217d954SCole Faust        .stride_w                      = stride_w
2499*c217d954SCole Faust    };
2500*c217d954SCole Faust
2501*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2502*c217d954SCole Faust    return tensor;
2503*c217d954SCole Faust}
2504*c217d954SCole Faust
2505*c217d954SCole Faust
/** Returns the address of element @p x of a Vector, relative to its current pointer.
 *
 * @param[in] vec Vector descriptor
 * @param[in] x   Signed index along x
 *
 * @return vec->ptr + x * vec->stride_x, as a pointer to const bytes
 */
2506*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
2507*c217d954SCole Faust{
2508*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
2509*c217d954SCole Faust}
2510*c217d954SCole Faust
2511*c217d954SCole Faust
/** Returns the address of element (@p x, @p y) of an Image, relative to its current pointer.
 *
 * @param[in] img Image descriptor
 * @param[in] x   Signed index along x
 * @param[in] y   Signed index along y
 *
 * @return img->ptr + x * img->stride_x + y * img->stride_y, as a pointer to writable bytes
 */
2512*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
2513*c217d954SCole Faust{
2514*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
2515*c217d954SCole Faust}
2516*c217d954SCole Faust
2517*c217d954SCole Faust
/** Returns the address of element (@p x, @p y, @p z) of a Tensor3D, relative to its current pointer.
 *
 * @param[in] tensor Tensor3D descriptor
 * @param[in] x      Signed index along x
 * @param[in] y      Signed index along y
 * @param[in] z      Signed index along z
 *
 * @return tensor->ptr plus each index multiplied by its stride, as a pointer to const bytes
 */
2518*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2519*c217d954SCole Faust{
2520*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2521*c217d954SCole Faust}
2522*c217d954SCole Faust
2523*c217d954SCole Faust
/** Returns the address of element (@p x, @p y, @p z, @p w) of a Tensor4D, relative to its current pointer.
 *
 * @param[in] tensor Tensor4D descriptor
 * @param[in] x      Signed index along x
 * @param[in] y      Signed index along y
 * @param[in] z      Signed index along z
 * @param[in] w      Signed index along w
 *
 * @return tensor->ptr plus each index multiplied by its stride, as a pointer to const bytes
 */
2524*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2525*c217d954SCole Faust{
2526*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2527*c217d954SCole Faust}
2528*c217d954SCole Faust
2529*c217d954SCole Faust
/** Converts a linear element index into an address inside a Tensor3D.
 *
 * The index is decomposed as index = x + y * width + z * (width * height). Unlike the
 * *_offset helpers, the result also adds tensor->offset_first_element_in_bytes.
 *
 * @param[in] tensor Tensor3D descriptor
 * @param[in] width  Number of elements along x; used as a divisor, so it must not be 0
 * @param[in] height Number of elements along y; width * height is used as a divisor, so it must not be 0
 * @param[in] depth  Not used by the body
 * @param[in] index  Linear element index
 *
 * @return Pointer to const bytes at the decoded (x, y, z) position
 */
2530*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2531*c217d954SCole Faust{
2532*c217d954SCole Faust    uint num_elements = width * height; /* elements in one z plane */
2533*c217d954SCole Faust
2534*c217d954SCole Faust    const uint z = index / num_elements;
2535*c217d954SCole Faust
2536*c217d954SCole Faust    index %= num_elements; /* remainder is the index inside the plane */
2537*c217d954SCole Faust
2538*c217d954SCole Faust    const uint y = index / width;
2539*c217d954SCole Faust
2540*c217d954SCole Faust    index %= width; /* remainder is the index inside the row */
2541*c217d954SCole Faust
2542*c217d954SCole Faust    const uint x = index;
2543*c217d954SCole Faust
2544*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2545*c217d954SCole Faust}
2546*c217d954SCole Faust
2547*c217d954SCole Faust#endif
2548*c217d954SCole Faust
2549*c217d954SCole Faust
/** Selects @p n0 consecutive components of vector @p x starting at component @p offset by
 *  dispatching to the matching scalar_access_<offset>_<n0> macro.
 *
 * SCALAR_ACCESS forwards to SCALAR_ACCESS_STR so that macro arguments are expanded before
 * they are pasted into the scalar_access_* name.
 */
2550*c217d954SCole Faust#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
2551*c217d954SCole Faust#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
2552*c217d954SCole Faust
2553*c217d954SCole Faust
/** Component selectors used by SCALAR_ACCESS.
 *
 * scalar_access_<offset>_<n0>(x) expands to a `.s` swizzle of x that names n0 consecutive
 * components, starting at the one whose hexadecimal digit equals <offset>. Only the
 * offset/size combinations listed below exist.
 */
/* offset 0 */
2554*c217d954SCole Faust#define scalar_access_0_1(x) ((x).s0)
2555*c217d954SCole Faust#define scalar_access_0_2(x) ((x).s01)
2556*c217d954SCole Faust#define scalar_access_0_3(x) ((x).s012)
2557*c217d954SCole Faust#define scalar_access_0_4(x) ((x).s0123)
2558*c217d954SCole Faust#define scalar_access_0_8(x) ((x).s01234567)
2559*c217d954SCole Faust#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
2560*c217d954SCole Faust
2561*c217d954SCole Faust
/* offset 1 */
2562*c217d954SCole Faust#define scalar_access_1_1(x) ((x).s1)
2563*c217d954SCole Faust#define scalar_access_1_2(x) ((x).s12)
2564*c217d954SCole Faust#define scalar_access_1_3(x) ((x).s123)
2565*c217d954SCole Faust#define scalar_access_1_4(x) ((x).s1234)
2566*c217d954SCole Faust#define scalar_access_1_8(x) ((x).s12345678)
2567*c217d954SCole Faust
2568*c217d954SCole Faust
/* offset 2 */
2569*c217d954SCole Faust#define scalar_access_2_1(x) ((x).s2)
2570*c217d954SCole Faust#define scalar_access_2_2(x) ((x).s23)
2571*c217d954SCole Faust#define scalar_access_2_3(x) ((x).s234)
2572*c217d954SCole Faust#define scalar_access_2_4(x) ((x).s2345)
2573*c217d954SCole Faust#define scalar_access_2_8(x) ((x).s23456789)
2574*c217d954SCole Faust
2575*c217d954SCole Faust
/* offset 3 */
2576*c217d954SCole Faust#define scalar_access_3_1(x) ((x).s3)
2577*c217d954SCole Faust#define scalar_access_3_2(x) ((x).s34)
2578*c217d954SCole Faust#define scalar_access_3_3(x) ((x).s345)
2579*c217d954SCole Faust#define scalar_access_3_4(x) ((x).s3456)
2580*c217d954SCole Faust#define scalar_access_3_8(x) ((x).s3456789A)
2581*c217d954SCole Faust
2582*c217d954SCole Faust
/* offset 4 */
2583*c217d954SCole Faust#define scalar_access_4_1(x) ((x).s4)
2584*c217d954SCole Faust#define scalar_access_4_2(x) ((x).s45)
2585*c217d954SCole Faust#define scalar_access_4_3(x) ((x).s456)
2586*c217d954SCole Faust#define scalar_access_4_4(x) ((x).s4567)
2587*c217d954SCole Faust#define scalar_access_4_8(x) ((x).s456789AB)
2588*c217d954SCole Faust
2589*c217d954SCole Faust
/* offset 8 */
2590*c217d954SCole Faust#define scalar_access_8_1(x) ((x).s8)
2591*c217d954SCole Faust#define scalar_access_8_2(x) ((x).s89)
2592*c217d954SCole Faust#define scalar_access_8_3(x) ((x).s89A)
2593*c217d954SCole Faust#define scalar_access_8_4(x) ((x).s89AB)
2594*c217d954SCole Faust#define scalar_access_8_8(x) ((x).s89ABCDEF)
2595*c217d954SCole Faust
2596*c217d954SCole Faust
/* offset 12 */
2597*c217d954SCole Faust#define scalar_access_12_1(x) ((x).sC)
2598*c217d954SCole Faust#define scalar_access_12_2(x) ((x).sCD)
2599*c217d954SCole Faust#define scalar_access_12_3(x) ((x).sCDE)
2600*c217d954SCole Faust#define scalar_access_12_4(x) ((x).sCDEF)
2601*c217d954SCole Faust
2602*c217d954SCole Faust
/* offset 16
 * NOTE(review): this selects .sF, i.e. component 15, which breaks the offset == hex digit
 * pattern of every other entry. Confirm whether the name or the swizzle is the intended one.
 */
2603*c217d954SCole Faust#define scalar_access_16_1(x) ((x).sF)
2604*c217d954SCole Faust
2605*c217d954SCole Faust
/** LOAD_TENSOR_ROW_n: loads n rows into the already-declared variables BASENAME0 .. BASENAME(n-1).
 *
 * Each LOAD_TENSOR_ROW_n first expands LOAD_TENSOR_ROW_(n-1) and then appends one assignment
 * for row i = n - 1:
 *
 *   SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##i) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 *
 * so only the N0 components starting at COL_OFFSET of each row variable are written.
 * Row suffixes are single hexadecimal digits (0-9, A-F); LOAD_TENSOR_ROW_0 expands to an
 * empty statement expression. VLOAD is defined outside this part of the file.
 *
 * @param N0         Number of elements loaded per row
 * @param DATA_TYPE  Element type the byte pointer is cast to
 * @param BASENAME   Prefix of the destination row variables
 * @param PTR        Base address; the address arithmetic is performed on it before the cast
 * @param COL_OFFSET First destination component, as accepted by SCALAR_ACCESS
 * @param STRIDE_Y   Amount added to PTR per row
 * @param Z          Prefix of the per-row extra offsets Z0 .. ZF added to the address
 */
2606*c217d954SCole Faust#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2607*c217d954SCole Faust    ({})
2608*c217d954SCole Faust
2609*c217d954SCole Faust#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2610*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
2611*c217d954SCole Faust
2612*c217d954SCole Faust#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2613*c217d954SCole Faust    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2614*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
2615*c217d954SCole Faust
2616*c217d954SCole Faust#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2617*c217d954SCole Faust    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2618*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
2619*c217d954SCole Faust
2620*c217d954SCole Faust#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2621*c217d954SCole Faust    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2622*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
2623*c217d954SCole Faust
2624*c217d954SCole Faust#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2625*c217d954SCole Faust    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2626*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
2627*c217d954SCole Faust
2628*c217d954SCole Faust#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2629*c217d954SCole Faust    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2630*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
2631*c217d954SCole Faust
2632*c217d954SCole Faust#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2633*c217d954SCole Faust    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2634*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
2635*c217d954SCole Faust
2636*c217d954SCole Faust#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2637*c217d954SCole Faust    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2638*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
2639*c217d954SCole Faust
2640*c217d954SCole Faust#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2641*c217d954SCole Faust    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2642*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
2643*c217d954SCole Faust
2644*c217d954SCole Faust#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2645*c217d954SCole Faust    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
2646*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
2647*c217d954SCole Faust
2648*c217d954SCole Faust#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2649*c217d954SCole Faust    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2650*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
2651*c217d954SCole Faust
2652*c217d954SCole Faust#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2653*c217d954SCole Faust    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2654*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
2655*c217d954SCole Faust
2656*c217d954SCole Faust#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2657*c217d954SCole Faust    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2658*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
2659*c217d954SCole Faust
2660*c217d954SCole Faust#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2661*c217d954SCole Faust    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2662*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
2663*c217d954SCole Faust
2664*c217d954SCole Faust#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2665*c217d954SCole Faust    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2666*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
2667*c217d954SCole Faust
2668*c217d954SCole Faust#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2669*c217d954SCole Faust    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2670*c217d954SCole Faust    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2671*c217d954SCole Faust
2672*c217d954SCole Faust
2673*c217d954SCole Faust
/** Loads M0 rows of N0 elements each into BASENAME0 .. BASENAME(M0-1) by dispatching to
 *  LOAD_TENSOR_ROW_<M0>.
 *
 * LOAD_TENSOR forwards to LOAD_TENSOR_STR so that M0 is macro-expanded before it is pasted
 * into the LOAD_TENSOR_ROW_ name. LOAD_TENSOR_ROW_0 .. LOAD_TENSOR_ROW_16 exist.
 */
2674*c217d954SCole Faust#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2675*c217d954SCole Faust#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2676*c217d954SCole Faust
2677*c217d954SCole Faust
2678*c217d954SCole Faust
/** LOAD_TENSOR_M0X<n>: loads an M0 x n block into the row variables a0 .. a(M0-1).
 *
 * Widths 1, 2, 3, 4 and 8 are forwarded to LOAD_TENSOR unchanged with column offset 0.
 * Widths 5, 6 and 7 are split in two LOAD_TENSOR calls: a 4-wide load at column 0 followed by
 * a 1-, 2- or 3-wide load at column 4, whose source pointer is advanced by
 * 4 * sizeof(DATA_TYPE). Width 0 expands to an empty statement expression.
 *
 * @param M0           Number of rows
 * @param N0           Number of columns (ignored by the split variants, which hard-code the widths)
 * @param DATA_TYPE    Element type
 * @param a            Prefix of the destination row variables
 * @param input_ptr    Source address; the split variants add a sizeof-scaled amount to it
 * @param src_stride_y Amount added to the address per row
 * @param zin          Prefix of the per-row extra offsets
 */
2679*c217d954SCole Faust#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2680*c217d954SCole Faust    ({})
2681*c217d954SCole Faust
2682*c217d954SCole Faust#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2683*c217d954SCole Faust    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2684*c217d954SCole Faust
2685*c217d954SCole Faust#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2686*c217d954SCole Faust    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2687*c217d954SCole Faust
2688*c217d954SCole Faust#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2689*c217d954SCole Faust    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2690*c217d954SCole Faust
2691*c217d954SCole Faust#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2692*c217d954SCole Faust    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2693*c217d954SCole Faust
2694*c217d954SCole Faust#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2695*c217d954SCole Faust    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2696*c217d954SCole Faust    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2697*c217d954SCole Faust
2698*c217d954SCole Faust#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2699*c217d954SCole Faust    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2700*c217d954SCole Faust    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2701*c217d954SCole Faust
2702*c217d954SCole Faust#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2703*c217d954SCole Faust    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2704*c217d954SCole Faust    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2705*c217d954SCole Faust
2706*c217d954SCole Faust#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2707*c217d954SCole Faust    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2708*c217d954SCole Faust
/** Loads an M0 x 9 block into the row variables a0 .. a(M0-1).
 *
 * The block is split in an 8-wide load at column 0 and a 1-wide load at column 8, whose
 * source pointer is advanced by 8 * sizeof(DATA_TYPE).
 *
 * Fix: the first LOAD_TENSOR call was written `input_ptr 0`, without the comma separating
 * the pointer from the column offset. That passed 7 arguments to the 8-parameter
 * LOAD_TENSOR macro, so any expansion of this macro failed to preprocess.
 *
 * @param M0           Number of rows
 * @param N0           Number of columns (unused here; the widths 8 and 1 are hard-coded)
 * @param DATA_TYPE    Element type
 * @param a            Prefix of the destination row variables
 * @param input_ptr    Source address
 * @param src_stride_y Amount added to the address per row
 * @param zin          Prefix of the per-row extra offsets
 */
2709*c217d954SCole Faust#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2710*c217d954SCole Faust    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2711*c217d954SCole Faust    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2712*c217d954SCole Faust
/** LOAD_TENSOR_M0X10 .. LOAD_TENSOR_M0X13: load an M0 x n block into a0 .. a(M0-1).
 *
 * Each variant starts with an 8-wide load at column 0. Widths 10, 11 and 12 add a 2-, 3- or
 * 4-wide load at column 8 (source advanced by 8 * sizeof(DATA_TYPE)). Width 13 adds a 4-wide
 * load at column 8 and a 1-wide load at column 12 (source advanced by 12 * sizeof(DATA_TYPE)).
 * The N0 parameter is not used; the widths are hard-coded.
 */
2713*c217d954SCole Faust#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2714*c217d954SCole Faust    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2715*c217d954SCole Faust    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2716*c217d954SCole Faust
2717*c217d954SCole Faust#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2718*c217d954SCole Faust    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2719*c217d954SCole Faust    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2720*c217d954SCole Faust
2721*c217d954SCole Faust#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2722*c217d954SCole Faust    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2723*c217d954SCole Faust    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2724*c217d954SCole Faust
2725*c217d954SCole Faust#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2726*c217d954SCole Faust    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2727*c217d954SCole Faust    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2728*c217d954SCole Faust    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2729*c217d954SCole Faust
/** Loads an M0 x 14 block into the row variables a0 .. a(M0-1).
 *
 * The block is split in an 8-wide load at column 0, a 4-wide load at column 8 (source
 * advanced by 8 * sizeof(DATA_TYPE)) and a 2-wide load at column 12 (source advanced by
 * 12 * sizeof(DATA_TYPE)).
 *
 * Fix: the first LOAD_TENSOR call was written `input_ptr 0`, without the comma separating
 * the pointer from the column offset. That passed 7 arguments to the 8-parameter
 * LOAD_TENSOR macro, so any expansion of this macro failed to preprocess.
 *
 * @param M0           Number of rows
 * @param N0           Number of columns (unused here; the widths 8, 4 and 2 are hard-coded)
 * @param DATA_TYPE    Element type
 * @param a            Prefix of the destination row variables
 * @param input_ptr    Source address
 * @param src_stride_y Amount added to the address per row
 * @param zin          Prefix of the per-row extra offsets
 */
2730*c217d954SCole Faust#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2731*c217d954SCole Faust    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2732*c217d954SCole Faust    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2733*c217d954SCole Faust    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2734*c217d954SCole Faust
/** Loads an M0 x 15 block into a0 .. a(M0-1): an 8-wide load at column 0, a 4-wide load at
 *  column 8 and a 3-wide load at column 12, with the source pointer advanced by 8 and 12
 *  times sizeof(DATA_TYPE) respectively. N0 is not used.
 */
2735*c217d954SCole Faust#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2736*c217d954SCole Faust    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2737*c217d954SCole Faust    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2738*c217d954SCole Faust    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2739*c217d954SCole Faust
/** Loads an M0 x 16 block into a0 .. a(M0-1) with a single LOAD_TENSOR call at column 0,
 *  forwarding N0 unchanged.
 */
2740*c217d954SCole Faust#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2741*c217d954SCole Faust    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2742*c217d954SCole Faust
2743*c217d954SCole Faust
2744*c217d954SCole Faust
/** Loads an M0 x N0 block into BASENAME0 .. BASENAME(M0-1) by dispatching to
 *  LOAD_TENSOR_M0X<N0>.
 *
 * LOAD_TENSOR_M0XN0 forwards to LOAD_TENSOR_M0XN0_STR so that N0 is macro-expanded before it
 * is pasted into the LOAD_TENSOR_M0X name. LOAD_TENSOR_M0X0 .. LOAD_TENSOR_M0X16 exist.
 */
2745*c217d954SCole Faust#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2746*c217d954SCole Faust#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2747*c217d954SCole Faust
2748*c217d954SCole Faust
/** LOAD_ROW_n: declares the row variables BASENAME0 .. BASENAME(n-1) and loads each from memory.
 *
 * Each LOAD_ROW_n first expands LOAD_ROW_(n-1) and then appends, for row i = n - 1:
 *
 *   VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##i = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + i * STRIDE_Y + Z##i));
 *
 * Unlike LOAD_TENSOR_ROW_n, these macros introduce new variables in the enclosing scope
 * rather than writing into existing ones. Row suffixes are single hexadecimal digits
 * (0-9, A-F). VEC_DATA_TYPE and VLOAD are defined outside this part of the file.
 *
 * @param N0        Number of elements per row
 * @param DATA_TYPE Element type
 * @param BASENAME  Prefix of the declared row variables
 * @param PTR       Base address; the address arithmetic is performed on it before the cast
 * @param OFFSET    Amount added to every row address
 * @param STRIDE_Y  Amount added to the address per row
 * @param Z         Prefix of the per-row extra offsets added to the address
 */
2749*c217d954SCole Faust#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2750*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2751*c217d954SCole Faust    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2752*c217d954SCole Faust
2753*c217d954SCole Faust#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2754*c217d954SCole Faust    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2755*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2756*c217d954SCole Faust    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2757*c217d954SCole Faust
2758*c217d954SCole Faust#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2759*c217d954SCole Faust    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2760*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2761*c217d954SCole Faust    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2762*c217d954SCole Faust
2763*c217d954SCole Faust#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2764*c217d954SCole Faust    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2765*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2766*c217d954SCole Faust    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2767*c217d954SCole Faust
2768*c217d954SCole Faust#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2769*c217d954SCole Faust    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2770*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2771*c217d954SCole Faust    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2772*c217d954SCole Faust
2773*c217d954SCole Faust#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2774*c217d954SCole Faust    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2775*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2776*c217d954SCole Faust    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2777*c217d954SCole Faust
2778*c217d954SCole Faust#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2779*c217d954SCole Faust    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2780*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2781*c217d954SCole Faust    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2782*c217d954SCole Faust
2783*c217d954SCole Faust#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2784*c217d954SCole Faust    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2785*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2786*c217d954SCole Faust    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2787*c217d954SCole Faust
2788*c217d954SCole Faust#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2789*c217d954SCole Faust    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2790*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2791*c217d954SCole Faust    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2792*c217d954SCole Faust
2793*c217d954SCole Faust#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2794*c217d954SCole Faust    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2795*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2796*c217d954SCole Faust    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2797*c217d954SCole Faust
2798*c217d954SCole Faust#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2799*c217d954SCole Faust    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2800*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2801*c217d954SCole Faust    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2802*c217d954SCole Faust
2803*c217d954SCole Faust#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2804*c217d954SCole Faust    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2805*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2806*c217d954SCole Faust    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2807*c217d954SCole Faust
2808*c217d954SCole Faust#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2809*c217d954SCole Faust    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2810*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2811*c217d954SCole Faust    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2812*c217d954SCole Faust
2813*c217d954SCole Faust#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2814*c217d954SCole Faust    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2815*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2816*c217d954SCole Faust    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2817*c217d954SCole Faust
2818*c217d954SCole Faust#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2819*c217d954SCole Faust    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2820*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2821*c217d954SCole Faust    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2822*c217d954SCole Faust
2823*c217d954SCole Faust#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2824*c217d954SCole Faust    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2825*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2826*c217d954SCole Faust    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2827*c217d954SCole Faust
2828*c217d954SCole Faust
2829*c217d954SCole Faust
2830*c217d954SCole Faust
/* LOAD_BLOCK(M0, N0, ...): entry point that dispatches to LOAD_ROW_<M0>.
 *
 * LOAD_BLOCK_STR pastes M0 onto "LOAD_ROW_" with ##. Operands of ## are not
 * macro-expanded, so LOAD_BLOCK exists only to forward its arguments once: that
 * forwarding expands an M0 that is itself a macro (e.g. a build-time -D value)
 * to its number before the paste happens. Call LOAD_BLOCK, not LOAD_BLOCK_STR,
 * unless M0 is already a literal.
 * All other arguments are passed through unchanged to LOAD_ROW_<M0>.
 */
2831*c217d954SCole Faust#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2832*c217d954SCole Faust#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2833*c217d954SCole Faust
2834*c217d954SCole Faust
2835*c217d954SCole Faust
/* LOAD_ROW_PARTIAL_n (n = 1..16): partial-width loads into EXISTING row variables.
 *
 * Each LOAD_ROW_PARTIAL_n expands LOAD_ROW_PARTIAL_(n-1) (for n > 1) and then, for
 * row k = n - 1, invokes
 *   VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##k, 0,
 *                              (__global DATA_TYPE *)(PTR + OFFSET + k * STRIDE_Y + Z##k));
 * The suffix k is a single hexadecimal digit (rows 10..15 use A..F).
 *
 * Unlike LOAD_ROW_n, nothing is declared here: BASENAME##k is passed as the first
 * argument to the VLOAD_PARTIAL expansion, so the caller must have declared
 * BASENAME##0 .. BASENAME##(n-1) beforehand.
 *
 * NOTE(review): VLOAD_PARTIAL is defined outside this section; presumably it fills
 * only the first LOAD_N0 lanes of an N0-wide vector - confirm at its definition.
 * The address is summed before the cast to DATA_TYPE*, and PTR/OFFSET/STRIDE_Y are
 * pasted unparenthesised, exactly as in LOAD_ROW_n.
 */
2836*c217d954SCole Faust#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2837*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2838*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2839*c217d954SCole Faust
2840*c217d954SCole Faust#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2841*c217d954SCole Faust    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2842*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2843*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2844*c217d954SCole Faust
2845*c217d954SCole Faust#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2846*c217d954SCole Faust    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2847*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2848*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2849*c217d954SCole Faust
2850*c217d954SCole Faust#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2851*c217d954SCole Faust    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2852*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2853*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2854*c217d954SCole Faust
2855*c217d954SCole Faust#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2856*c217d954SCole Faust    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2857*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2858*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2859*c217d954SCole Faust
2860*c217d954SCole Faust#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2861*c217d954SCole Faust    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2862*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2863*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2864*c217d954SCole Faust
2865*c217d954SCole Faust#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2866*c217d954SCole Faust    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2867*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2868*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2869*c217d954SCole Faust
2870*c217d954SCole Faust#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2871*c217d954SCole Faust    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2872*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2873*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2874*c217d954SCole Faust
2875*c217d954SCole Faust#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2876*c217d954SCole Faust    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2877*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2878*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2879*c217d954SCole Faust
2880*c217d954SCole Faust#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2881*c217d954SCole Faust    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2882*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2883*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2884*c217d954SCole Faust
2885*c217d954SCole Faust#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2886*c217d954SCole Faust    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2887*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2888*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2889*c217d954SCole Faust
2890*c217d954SCole Faust#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2891*c217d954SCole Faust    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2892*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2893*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2894*c217d954SCole Faust
2895*c217d954SCole Faust#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2896*c217d954SCole Faust    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2897*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2898*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2899*c217d954SCole Faust
2900*c217d954SCole Faust#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2901*c217d954SCole Faust    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2902*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2903*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2904*c217d954SCole Faust
2905*c217d954SCole Faust#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2906*c217d954SCole Faust    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2907*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2908*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2909*c217d954SCole Faust
2910*c217d954SCole Faust#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2911*c217d954SCole Faust    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2912*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2913*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2914*c217d954SCole Faust
2915*c217d954SCole Faust
2916*c217d954SCole Faust
/* LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, ...): dispatches to LOAD_ROW_PARTIAL_<LOAD_M0>.
 *
 * LOAD_BLOCK_PARTIAL forwards once to LOAD_BLOCK_PARTIAL_STR so that a LOAD_M0 which
 * is itself a macro is expanded before ## pastes it onto "LOAD_ROW_PARTIAL_".
 *
 * Mind the argument order: this macro takes (LOAD_M0, LOAD_N0, N0, ...), whereas the
 * row macros it expands to take (N0, LOAD_N0, ...); the _STR helper does the reordering.
 * The remaining arguments are forwarded unchanged.
 */
2917*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2918*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2919*c217d954SCole Faust
/* LOAD_BLOCK_PARTIAL_IN_X_AND_Y: choose the block size at run time in both dimensions.
 *
 * Expands to an if / else-if / else chain over the two conditions:
 *   !COND_X && !COND_Y -> LOAD_BLOCK_PARTIAL(M0,               N0,               N0, ...)
 *    COND_Y && !COND_X -> LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0,               N0, ...)
 *   !COND_Y &&  COND_X -> LOAD_BLOCK_PARTIAL(M0,               PARTIAL_STORE_N0, N0, ...)
 *   otherwise (both)   -> LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, ...)
 * i.e. PARTIAL_COND_Y selects the row count and PARTIAL_COND_X selects the number of
 * elements loaded per row. (The PARTIAL_STORE_* parameter names are used here for loads.)
 *
 * Usage notes that follow from the expansion:
 *   - The row variables BASENAME##k are not declared here; the caller must declare them.
 *   - PARTIAL_COND_X / PARTIAL_COND_Y can be evaluated more than once along the chain,
 *     so they should be side-effect free.
 *   - The expansion is a bare if/else statement (no do { } while(0) wrapper), so it
 *     must be used where a full statement is allowed and not as the unbraced body of
 *     another if.
 */
2920*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2921*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
2922*c217d954SCole Faust    {                                                                                                                                                            \
2923*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
2924*c217d954SCole Faust    }                                                                                                                                                            \
2925*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
2926*c217d954SCole Faust    {                                                                                                                                                            \
2927*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2928*c217d954SCole Faust    }                                                                                                                                                            \
2929*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
2930*c217d954SCole Faust    {                                                                                                                                                            \
2931*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2932*c217d954SCole Faust    }                                                                                                                                                            \
2933*c217d954SCole Faust    else                                                                                                                                                         \
2934*c217d954SCole Faust    {                                                                                                                                                            \
2935*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
2936*c217d954SCole Faust    }
2937*c217d954SCole Faust
/* LOAD_BLOCK_PARTIAL_IN_X: run-time choice of the per-row load width only.
 *
 * Always loads M0 rows. When PARTIAL_COND_X is false each row is loaded with
 * LOAD_N0 = N0; when it is true each row is loaded with LOAD_N0 = PARTIAL_STORE_N0.
 * Both branches go through LOAD_BLOCK_PARTIAL, so the row variables BASENAME##k
 * must already be declared by the caller.
 * The expansion is a bare if/else statement (no do { } while(0) wrapper).
 */
2938*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
2939*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                                \
2940*c217d954SCole Faust    {                                                                                                                    \
2941*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2942*c217d954SCole Faust    }                                                                                                                    \
2943*c217d954SCole Faust    else                                                                                                                 \
2944*c217d954SCole Faust    {                                                                                                                    \
2945*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2946*c217d954SCole Faust    }
2947*c217d954SCole Faust
/* LOAD_BLOCK_PARTIAL_IN_Y: run-time choice of the row count only.
 *
 * Every loaded row uses the full width (LOAD_N0 = N0). When PARTIAL_COND_Y is false
 * M0 rows are loaded; when it is true only PARTIAL_STORE_M0 rows are loaded, leaving
 * the remaining BASENAME##k variables untouched by this macro.
 * Both branches go through LOAD_BLOCK_PARTIAL, so the row variables must already be
 * declared by the caller.
 * The expansion is a bare if/else statement (no do { } while(0) wrapper).
 */
2948*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
2949*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                                \
2950*c217d954SCole Faust    {                                                                                                                    \
2951*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2952*c217d954SCole Faust    }                                                                                                                    \
2953*c217d954SCole Faust    else                                                                                                                 \
2954*c217d954SCole Faust    {                                                                                                                    \
2955*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2956*c217d954SCole Faust    }
2957*c217d954SCole Faust
2958*c217d954SCole Faust
/* LOAD_BLOCK_BOUNDARY_AWARE: one of four definitions, picked by the preprocessor from
 * the build-time values of PARTIAL_STORE_M0 and PARTIAL_STORE_N0.
 *
 *   M0 == 0 && N0 == 0 : plain LOAD_BLOCK (which declares the row variables itself);
 *                        the partial-size and condition arguments are ignored.
 *   M0 >  0 && N0 == 0 : REPEAT_VAR_INIT_TO_CONST(..., 0) then LOAD_BLOCK_PARTIAL_IN_Y.
 *   M0 == 0 && N0 >  0 : REPEAT_VAR_INIT_TO_CONST(..., 0) then LOAD_BLOCK_PARTIAL_IN_X.
 *   anything else      : REPEAT_VAR_INIT_TO_CONST(..., 0) then LOAD_BLOCK_PARTIAL_IN_X_AND_Y.
 *
 * In an #if expression an undefined identifier evaluates to 0, so when neither symbol
 * is defined the first (no-boundary) variant is selected.
 *
 * Inside each definition PARTIAL_STORE_M0 / PARTIAL_STORE_N0 are macro PARAMETERS and
 * therefore refer to the caller's arguments, not to the build-time symbols tested by
 * the #if lines.
 *
 * NOTE(review): REPEAT_VAR_INIT_TO_CONST is defined outside this section; presumably it
 * declares BASENAME##0.. of the given vector type initialised to 0, which is what the
 * declaration-free partial loaders require - confirm at its definition.
 */
2959*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2960*c217d954SCole Faust
2961*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2962*c217d954SCole Faust    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2963*c217d954SCole Faust
2964*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2965*c217d954SCole Faust
2966*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2967*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2968*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2969*c217d954SCole Faust
2970*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2971*c217d954SCole Faust
2972*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2973*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2974*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2975*c217d954SCole Faust
2976*c217d954SCole Faust#else
2977*c217d954SCole Faust
2978*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2979*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2980*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2981*c217d954SCole Faust
2982*c217d954SCole Faust#endif
2983*c217d954SCole Faust
2984*c217d954SCole Faust
/* LOAD_TEXTURE2D_ROW_n (n = 1..16): read n rows from a 2D image into EXISTING variables.
 *
 * Each LOAD_TEXTURE2D_ROW_n expands LOAD_TEXTURE2D_ROW_(n-1) (for n > 1) and then, for
 * row k = n - 1, assigns
 *   BASENAME##k = READ_IMAGE2D(DATA_TYPE, N0, IMG,
 *                              (X_COORD + k * X_STEP_ROW), (Y_COORD + k * Y_STEP_ROW))
 * The suffix k is a single hexadecimal digit (rows 10..15 use A..F), while the
 * coordinate multiplier is the decimal row index.
 *
 * Usage notes that follow from the expansion:
 *   - These are assignments only; BASENAME##0 .. BASENAME##(n-1) must already be declared.
 *   - Each coordinate sum is parenthesised as a whole, but X_COORD, Y_COORD and the
 *     step arguments are pasted unparenthesised inside it.
 *   - No ';' is written between consecutive row assignments in these macros.
 *     NOTE(review): presumably the READ_IMAGE2D expansion (defined elsewhere) supplies
 *     the statement terminator - confirm before changing either side.
 */
2985*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2986*c217d954SCole Faust    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
2987*c217d954SCole Faust
2988*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2989*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2990*c217d954SCole Faust    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
2991*c217d954SCole Faust
2992*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2993*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2994*c217d954SCole Faust    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
2995*c217d954SCole Faust
2996*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2997*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2998*c217d954SCole Faust    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
2999*c217d954SCole Faust
3000*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3001*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3002*c217d954SCole Faust    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
3003*c217d954SCole Faust
3004*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3005*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3006*c217d954SCole Faust    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
3007*c217d954SCole Faust
3008*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3009*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3010*c217d954SCole Faust    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
3011*c217d954SCole Faust
3012*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3013*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3014*c217d954SCole Faust    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
3015*c217d954SCole Faust
3016*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3017*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3018*c217d954SCole Faust    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
3019*c217d954SCole Faust
3020*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3021*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
3022*c217d954SCole Faust    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
3023*c217d954SCole Faust
3024*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3025*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3026*c217d954SCole Faust    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
3027*c217d954SCole Faust
3028*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3029*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3030*c217d954SCole Faust    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
3031*c217d954SCole Faust
3032*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3033*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3034*c217d954SCole Faust    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
3035*c217d954SCole Faust
3036*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3037*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3038*c217d954SCole Faust    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
3039*c217d954SCole Faust
3040*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3041*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3042*c217d954SCole Faust    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
3043*c217d954SCole Faust
3044*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3045*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3046*c217d954SCole Faust    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
3047*c217d954SCole Faust
3048*c217d954SCole Faust
3049*c217d954SCole Faust
/* LOAD_TEXTURE2D(M0, N0, ...): entry point that dispatches to LOAD_TEXTURE2D_ROW_<M0>.
 *
 * LOAD_TEXTURE2D forwards once to LOAD_TEXTURE2D_STR so that an M0 which is itself a
 * macro is expanded to its number before ## pastes it onto "LOAD_TEXTURE2D_ROW_".
 * All other arguments are passed through unchanged. As with the row macros, the
 * destination variables BASENAME##k are assigned, not declared.
 */
3050*c217d954SCole Faust#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3051*c217d954SCole Faust#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3052*c217d954SCole Faust
3053*c217d954SCole Faust
3054*c217d954SCole Faust
/* LOAD_ROW_INDIRECT_n (n = 1..8 in this span): masked loads from per-row indices.
 *
 * Each LOAD_ROW_INDIRECT_n expands LOAD_ROW_INDIRECT_(n-1) (for n > 1) and then, for
 * row k = n - 1:
 *   - declares BASENAME##k of type VEC_DATA_TYPE(DATA_TYPE, N0), uninitialised;
 *   - if Y_MASK##k != 0, loads it with VLOAD(N0) at index 0 from
 *     (__global DATA_TYPE *)(PTR + OFFSET + Y##k * STRIDE_Y);
 *   - otherwise assigns 0 to it, and no memory access is made for that row.
 *
 * Y and Y_MASK are name prefixes: Y##k supplies the row index that is multiplied by
 * STRIDE_Y, and Y_MASK##k is the value compared against 0. Both must exist for every
 * k in 0..n-1.
 *
 * Usage notes that follow from the expansion:
 *   - The expansion contains declarations, so BASENAME##k must not already exist in
 *     the enclosing scope.
 *   - The if/else bodies are unbraced single statements.
 *   - The address is summed before the cast to DATA_TYPE*, so OFFSET and
 *     Y##k * STRIDE_Y are in units of PTR's pointee type.
 *     NOTE(review): presumably bytes - confirm against callers.
 */
3055*c217d954SCole Faust#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3056*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3057*c217d954SCole Faust    BASENAME##0;                                                                            \
3058*c217d954SCole Faust    if(Y_MASK##0 != 0)                                                                      \
3059*c217d954SCole Faust        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
3060*c217d954SCole Faust    else                                                                                    \
3061*c217d954SCole Faust        BASENAME##0 = 0;
3062*c217d954SCole Faust
3063*c217d954SCole Faust#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3064*c217d954SCole Faust    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3065*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3066*c217d954SCole Faust    BASENAME##1;                                                                            \
3067*c217d954SCole Faust    if(Y_MASK##1 != 0)                                                                      \
3068*c217d954SCole Faust        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
3069*c217d954SCole Faust    else                                                                                    \
3070*c217d954SCole Faust        BASENAME##1 = 0;
3071*c217d954SCole Faust
3072*c217d954SCole Faust#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3073*c217d954SCole Faust    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3074*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3075*c217d954SCole Faust    BASENAME##2;                                                                            \
3076*c217d954SCole Faust    if(Y_MASK##2 != 0)                                                                      \
3077*c217d954SCole Faust        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
3078*c217d954SCole Faust    else                                                                                    \
3079*c217d954SCole Faust        BASENAME##2 = 0;
3080*c217d954SCole Faust
3081*c217d954SCole Faust#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3082*c217d954SCole Faust    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3083*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3084*c217d954SCole Faust    BASENAME##3;                                                                            \
3085*c217d954SCole Faust    if(Y_MASK##3 != 0)                                                                      \
3086*c217d954SCole Faust        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
3087*c217d954SCole Faust    else                                                                                    \
3088*c217d954SCole Faust        BASENAME##3 = 0;
3089*c217d954SCole Faust
3090*c217d954SCole Faust#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3091*c217d954SCole Faust    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3092*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3093*c217d954SCole Faust    BASENAME##4;                                                                            \
3094*c217d954SCole Faust    if(Y_MASK##4 != 0)                                                                      \
3095*c217d954SCole Faust        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
3096*c217d954SCole Faust    else                                                                                    \
3097*c217d954SCole Faust        BASENAME##4 = 0;
3098*c217d954SCole Faust
3099*c217d954SCole Faust#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3100*c217d954SCole Faust    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3101*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3102*c217d954SCole Faust    BASENAME##5;                                                                            \
3103*c217d954SCole Faust    if(Y_MASK##5 != 0)                                                                      \
3104*c217d954SCole Faust        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
3105*c217d954SCole Faust    else                                                                                    \
3106*c217d954SCole Faust        BASENAME##5 = 0;
3107*c217d954SCole Faust
3108*c217d954SCole Faust#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3109*c217d954SCole Faust    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3110*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3111*c217d954SCole Faust    BASENAME##6;                                                                            \
3112*c217d954SCole Faust    if(Y_MASK##6 != 0)                                                                      \
3113*c217d954SCole Faust        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
3114*c217d954SCole Faust    else                                                                                    \
3115*c217d954SCole Faust        BASENAME##6 = 0;
3116*c217d954SCole Faust
3117*c217d954SCole Faust#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3118*c217d954SCole Faust    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3119*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3120*c217d954SCole Faust    BASENAME##7;                                                                            \
3121*c217d954SCole Faust    if(Y_MASK##7 != 0)                                                                      \
3122*c217d954SCole Faust        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
3123*c217d954SCole Faust    else                                                                                    \
3124*c217d954SCole Faust        BASENAME##7 = 0;
3125*c217d954SCole Faust
3126*c217d954SCole Faust#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3127*c217d954SCole Faust    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3128*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3129*c217d954SCole Faust    BASENAME##8;                                                                            \
3130*c217d954SCole Faust    if(Y_MASK##8 != 0)                                                                      \
3131*c217d954SCole Faust        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
3132*c217d954SCole Faust    else                                                                                    \
3133*c217d954SCole Faust        BASENAME##8 = 0;
3134*c217d954SCole Faust
3135*c217d954SCole Faust#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3136*c217d954SCole Faust    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3137*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3138*c217d954SCole Faust    BASENAME##9;                                                                            \
3139*c217d954SCole Faust    if(Y_MASK##9 != 0)                                                                      \
3140*c217d954SCole Faust        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
3141*c217d954SCole Faust    else                                                                                    \
3142*c217d954SCole Faust        BASENAME##9 = 0;
3143*c217d954SCole Faust
3144*c217d954SCole Faust#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3145*c217d954SCole Faust    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3146*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3147*c217d954SCole Faust    BASENAME##A;                                                                            \
3148*c217d954SCole Faust    if(Y_MASK##A != 0)                                                                      \
3149*c217d954SCole Faust        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
3150*c217d954SCole Faust    else                                                                                    \
3151*c217d954SCole Faust        BASENAME##A = 0;
3152*c217d954SCole Faust
3153*c217d954SCole Faust#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3154*c217d954SCole Faust    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3155*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3156*c217d954SCole Faust    BASENAME##B;                                                                            \
3157*c217d954SCole Faust    if(Y_MASK##B != 0)                                                                      \
3158*c217d954SCole Faust        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
3159*c217d954SCole Faust    else                                                                                    \
3160*c217d954SCole Faust        BASENAME##B = 0;
3161*c217d954SCole Faust
3162*c217d954SCole Faust#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3163*c217d954SCole Faust    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3164*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3165*c217d954SCole Faust    BASENAME##C;                                                                            \
3166*c217d954SCole Faust    if(Y_MASK##C != 0)                                                                      \
3167*c217d954SCole Faust        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
3168*c217d954SCole Faust    else                                                                                    \
3169*c217d954SCole Faust        BASENAME##C = 0;
3170*c217d954SCole Faust
3171*c217d954SCole Faust#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3172*c217d954SCole Faust    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3173*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3174*c217d954SCole Faust    BASENAME##D;                                                                            \
3175*c217d954SCole Faust    if(Y_MASK##D != 0)                                                                      \
3176*c217d954SCole Faust        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
3177*c217d954SCole Faust    else                                                                                    \
3178*c217d954SCole Faust        BASENAME##D = 0;
3179*c217d954SCole Faust
3180*c217d954SCole Faust#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3181*c217d954SCole Faust    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3182*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3183*c217d954SCole Faust    BASENAME##E;                                                                            \
3184*c217d954SCole Faust    if(Y_MASK##E != 0)                                                                      \
3185*c217d954SCole Faust        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
3186*c217d954SCole Faust    else                                                                                    \
3187*c217d954SCole Faust        BASENAME##E = 0;
3188*c217d954SCole Faust
3189*c217d954SCole Faust#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3190*c217d954SCole Faust    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3191*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3192*c217d954SCole Faust    BASENAME##F;                                                                            \
3193*c217d954SCole Faust    if(Y_MASK##F != 0)                                                                      \
3194*c217d954SCole Faust        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
3195*c217d954SCole Faust    else                                                                                    \
3196*c217d954SCole Faust        BASENAME##F = 0;
3197*c217d954SCole Faust
3198*c217d954SCole Faust
/** Entry point for the masked row loaders.
 *
 * LOAD_BLOCK_INDIRECT forwards to LOAD_BLOCK_INDIRECT_STR so that M0 is fully
 * macro-expanded before it is pasted onto LOAD_ROW_INDIRECT_; the remaining
 * arguments are passed through unchanged.
 *
 * @param[in] M0 Number of rows to load; selects LOAD_ROW_INDIRECT_##M0
 */
#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3201*c217d954SCole Faust
3202*c217d954SCole Faust
/** Load one scalar per row and use it to initialise a row vector.
 *
 * LOAD_ELEMENT_n first expands LOAD_ELEMENT_(n-1), then declares BASENAME##k,
 * where k = n-1 written as a single hex digit (0-9, A-F), with type
 * VEC_DATA_TYPE(DATA_TYPE, N0), initialised from the single DATA_TYPE value
 * read at (PTR) + (OFFSET) + (n-1) * (STRIDE_Y).
 *
 * PTR, OFFSET and STRIDE_Y are parenthesised inside the address expression so
 * that compound arguments (for example a stride written as a + b) are scaled
 * by the row index as a whole.
 *
 * The expansion is a list of declarations, not a single statement.
 *
 * @param[in] N0        Number of elements of each declared row vector
 * @param[in] DATA_TYPE Element type
 * @param[in] BASENAME  Prefix of the row variables that get declared
 * @param[in] PTR       Base address
 * @param[in] OFFSET    Offset added to PTR
 * @param[in] STRIDE_Y  Distance between two consecutive rows, in the units of PTR arithmetic
 */
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 0 * (STRIDE_Y)));

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 1 * (STRIDE_Y)));

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 2 * (STRIDE_Y)));

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 3 * (STRIDE_Y)));

#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 4 * (STRIDE_Y)));

#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 5 * (STRIDE_Y)));

#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 6 * (STRIDE_Y)));

#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 7 * (STRIDE_Y)));

#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 8 * (STRIDE_Y)));

#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 9 * (STRIDE_Y)));

#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 10 * (STRIDE_Y)));

#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 11 * (STRIDE_Y)));

#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 12 * (STRIDE_Y)));

#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 13 * (STRIDE_Y)));

#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 14 * (STRIDE_Y)));

#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 15 * (STRIDE_Y)));

/** Entry point: LOAD_SCALAR_AS_VECTOR forwards to LOAD_SCALAR_AS_VECTOR_STR so
 * that M0 is fully macro-expanded before it is pasted onto LOAD_ELEMENT_.
 *
 * @param[in] M0 Number of rows; selects LOAD_ELEMENT_##M0
 */
#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3287*c217d954SCole Faust
3288*c217d954SCole Faust
3289*c217d954SCole Faust
/** Compute a per-row Z offset into caller-declared variables Z##0 .. Z##7.
 *
 * CALCULATE_Z_OFFSET_n first expands CALCULATE_Z_OFFSET_(n-1), then for row
 * k = n-1 performs, in order:
 *   Z##k  = (k + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);
 *   Z##k  = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##k);
 *   Z##k *= ((CROSS_PLANE_PAD) * (STRIDE_Y));
 *
 * The Z##k variables are assigned, not declared: the caller must declare them.
 * Only 1 to 8 rows are provided by this family.
 *
 * HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD and STRIDE_Y are parenthesised
 * so that compound arguments keep their intended value under the cast, the
 * subtraction and the multiplication.
 *
 * @param[in] M0              Not used by the row macros; kept for interface compatibility
 * @param[in] DATA_TYPE       Arithmetic type used for the computation
 * @param[in] Z               Prefix of the output variables
 * @param[in] Y               Index of the first row
 * @param[in] HEIGHT_GEMM3D   Divisor applied to the row index
 * @param[in] DEPTH_GEMM3D    Upper bound: the quotient is clamped to DEPTH_GEMM3D - 1
 * @param[in] CROSS_PLANE_PAD First factor of the final scaling
 * @param[in] STRIDE_Y        Second factor of the final scaling
 */
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##0 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##0); \
    Z##0 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##1 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##1); \
    Z##1 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##2 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##2); \
    Z##2 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##3 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##3); \
    Z##3 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##4 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##4); \
    Z##4 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##5 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##5); \
    Z##5 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##6 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##6); \
    Z##6 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##7 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##7); \
    Z##7 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

/** Entry point: CALCULATE_Z_OFFSET forwards to CALCULATE_Z_OFFSET_STR so that
 * M0 is fully macro-expanded before it is pasted onto CALCULATE_Z_OFFSET_.
 */
#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3342*c217d954SCole Faust
3343*c217d954SCole Faust
3344*c217d954SCole Faust
/** Multiply a block of row variables in place by a scale factor.
 *
 * SCALE_ROW_n first expands SCALE_ROW_(n-1), then multiplies BASENAME##k,
 * where k = n-1 written as a single hex digit (0-9, A-F), by (DATA_TYPE)(SCALE).
 *
 * SCALE is parenthesised before the cast so that a compound argument is
 * converted to DATA_TYPE as a whole rather than only its first operand.
 *
 * @param[in] DATA_TYPE Type the scale factor is cast to
 * @param[in] BASENAME  Prefix of the row variables to scale (must already exist)
 * @param[in] SCALE     Scale factor
 */
#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##0 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##1 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##2 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##3 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##4 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##5 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##6 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##7 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##8 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##9 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##A *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##B *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##C *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##D *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##E *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##F *= (DATA_TYPE)(SCALE);

/** Entry point: SCALE_BLOCK forwards to SCALE_BLOCK_STR so that N is fully
 * macro-expanded before it is pasted onto SCALE_ROW_.
 *
 * @param[in] N Number of rows to scale; selects SCALE_ROW_##N
 */
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
3412*c217d954SCole Faust
3413*c217d954SCole Faust
3414*c217d954SCole Faust
/** Gather one component from K0 row vectors into a new variable.
 *
 * COLUMN_VECTORk declares BASENAME##IDX_COL and initialises it with component
 * s##IDX_COL of X##0 .. X##(k-1), in that order (row suffixes are the hex
 * digits 0-9, A-F). For k = 1 the result has type TYPE; otherwise it has type
 * VEC_DATA_TYPE(TYPE, k). Provided for k = 1, 2, 3, 4, 8 and 16.
 *
 * @param[in] IDX_COL  Component selector pasted after .s, and suffix of the result name
 * @param[in] BASENAME Prefix of the declared result variable
 * @param[in] X        Prefix of the source row vectors
 * @param[in] TYPE     Element type of the result
 */
#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);

#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);

#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);

#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \
                                 (X##2).s##IDX_COL, (X##3).s##IDX_COL);

#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \
                                 (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
                                 (X##4).s##IDX_COL, (X##5).s##IDX_COL, \
                                 (X##6).s##IDX_COL, (X##7).s##IDX_COL);

#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \
                                  (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
                                  (X##4).s##IDX_COL, (X##5).s##IDX_COL, \
                                  (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
                                  (X##8).s##IDX_COL, (X##9).s##IDX_COL, \
                                  (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
                                  (X##C).s##IDX_COL, (X##D).s##IDX_COL, \
                                  (X##E).s##IDX_COL, (X##F).s##IDX_COL);
3432*c217d954SCole Faust
3433*c217d954SCole Faust
3434*c217d954SCole Faust
/** Pack K0 row values into a new variable, without component selection.
 *
 * COLUMN_VECTOR_SCALARk declares BASENAME##IDX_COL and initialises it from
 * X##0 .. X##(k-1), in that order (row suffixes are the hex digits 0-9, A-F).
 * For k = 1 the result has type TYPE; otherwise it has type
 * VEC_DATA_TYPE(TYPE, k). Provided for k = 1, 2, 3, 4, 8 and 16.
 * IDX_COL is only used as the suffix of the declared variable.
 *
 * @param[in] IDX_COL  Suffix of the result name
 * @param[in] BASENAME Prefix of the declared result variable
 * @param[in] X        Prefix of the source row variables
 * @param[in] TYPE     Element type of the result
 */
#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));

#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));

#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));

#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));

#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), \
                                 (X##4), (X##5), (X##6), (X##7));

#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) BASENAME##IDX_COL = \
        (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), \
                                  (X##4), (X##5), (X##6), (X##7), \
                                  (X##8), (X##9), (X##A), (X##B), \
                                  (X##C), (X##D), (X##E), (X##F));
3452*c217d954SCole Faust
3453*c217d954SCole Faust
3454*c217d954SCole Faust
/** Column-by-column expansion used by TRANSPOSE_K0XN0.
 *
 * TRANSPOSE_K0XN expands COLUMN_VECTOR(K0, c, BASENAME, BS, TYPE) for every
 * column index c in 0 .. N-1 (written as the hex digits 0-9, A-F), building on
 * the next smaller variant. The N = 1 variant instead expands
 * COLUMN_VECTOR_SCALAR for column 0. Provided for N = 1, 2, 3, 4, 8 and 16.
 *
 * @param[in] K0       Forwarded to COLUMN_VECTOR / COLUMN_VECTOR_SCALAR as the row count
 * @param[in] BASENAME Prefix of the variables declared by the column macros
 * @param[in] BS       Prefix of the source row variables
 * @param[in] TYPE     Element type
 */
#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);

#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);

#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);

#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);

#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);

#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
3482*c217d954SCole Faust
3483*c217d954SCole Faust
3484*c217d954SCole Faust
3485*c217d954SCole Faust
/** Dispatch to COLUMN_VECTOR##K0 (K0 is macro-expanded first through CONCAT).
 * The expansion already ends with a semicolon.
 */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR, K0)(IDX_COL, BASENAME, BS, TYPE);
3489*c217d954SCole Faust
3490*c217d954SCole Faust
/** Dispatch to the COLUMN_VECTOR_SCALAR<K0> macro.
 *
 * Expands to CONCAT(COLUMN_VECTOR_SCALAR, K0)(IDX_COL, BASENAME, BS, TYPE);
 * so the macro that is finally invoked is chosen by the value of K0.
 *
 * @param[in] K0       Suffix selecting which COLUMN_VECTOR_SCALAR<K0> macro is invoked
 * @param[in] IDX_COL  Forwarded as the first argument of COLUMN_VECTOR_SCALAR<K0>
 * @param[in] BASENAME Forwarded to COLUMN_VECTOR_SCALAR<K0>
 * @param[in] BS       Forwarded to COLUMN_VECTOR_SCALAR<K0>
 * @param[in] TYPE     Forwarded to COLUMN_VECTOR_SCALAR<K0>
 */
#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);
3494*c217d954SCole Faust
3495*c217d954SCole Faust
/** Dispatch to the TRANSPOSE_K0X<N0> macro.
 *
 * Expands to CONCAT(TRANSPOSE_K0X, N0)(K0, BASENAME, BS, TYPE); so the
 * variant that is finally invoked is chosen by the value of N0.
 *
 * @param[in] K0       Forwarded as the first argument of TRANSPOSE_K0X<N0>
 * @param[in] N0       Suffix selecting which TRANSPOSE_K0X<N0> macro is invoked
 * @param[in] BASENAME Forwarded to TRANSPOSE_K0X<N0>
 * @param[in] BS       Forwarded to TRANSPOSE_K0X<N0>
 * @param[in] TYPE     Forwarded to TRANSPOSE_K0X<N0>
 */
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
    CONCAT(TRANSPOSE_K0X, N0)                       \
    (K0, BASENAME, BS, TYPE);
3499*c217d954SCole Faust
3500*c217d954SCole Faust
/** Row-by-row addition of one block into another.
 *
 * ADD_ROW_<n>(BASENAME, BIAS) expands to the statements
 *     BASENAME##i += BIAS##i;
 * for every row i from 0 to n - 1, in ascending order. Rows 10..15 use the
 * single hexadecimal digits A..F as suffix. Each ADD_ROW_<n> is built from
 * ADD_ROW_<n-1> plus the statement for the last row.
 *
 * @param[in] BASENAME Name prefix of the variables that are updated in place
 * @param[in] BIAS     Name prefix of the variables that are added
 */
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS##2;

#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS##F;

/** Add the first N rows of block BIAS to block BASENAME.
 *
 * ADD_BLOCK forwards to ADD_BLOCK_STR so that an N which is itself a macro is
 * expanded to its value before ADD_BLOCK_STR pastes it onto ADD_ROW_.
 *
 * @param[in] N        Number of rows; selects ADD_ROW_<N>
 * @param[in] BASENAME Name prefix of the variables that are updated in place
 * @param[in] BIAS     Name prefix of the variables that are added
 */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
3569*c217d954SCole Faust
3570*c217d954SCole Faust
3571*c217d954SCole Faust
/** Row-by-row addition of one and the same operand to every row of a block.
 *
 * ADD_ROW_BROADCAST_<n>(BASENAME, BIAS) expands to the statements
 *     BASENAME##i += BIAS;
 * for every row i from 0 to n - 1, in ascending order. Rows 10..15 use the
 * single hexadecimal digits A..F as suffix. BIAS is pasted into every
 * statement as written, so an expression argument is evaluated once per row.
 *
 * @param[in] BASENAME Name prefix of the variables that are updated in place
 * @param[in] BIAS     Operand added to every row
 */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS;

#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS;

#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS;

#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS;

#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS;

#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS;

#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS;

#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS;

#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS;

#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS;

#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS;

/** Add BIAS to each of the first N rows of block BASENAME.
 *
 * ADD_BLOCK_BROADCAST forwards to ADD_BLOCK_BROADCAST_STR so that an N which
 * is itself a macro is expanded to its value before it is pasted onto
 * ADD_ROW_BROADCAST_.
 *
 * @param[in] N        Number of rows; selects ADD_ROW_BROADCAST_<N>
 * @param[in] BASENAME Name prefix of the variables that are updated in place
 * @param[in] BIAS     Operand added to every row
 */
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
3638*c217d954SCole Faust
3639*c217d954SCole Faust
3640*c217d954SCole Faust
/** Row-by-row application of ACTIVATION to a block.
 *
 * ACTIVATION_ROW_<n>(...) expands to the statements
 *     BASENAME##i = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##i, A_VAL, B_VAL);
 * for every row i from 0 to n - 1, in ascending order. Rows 10..15 use the
 * single hexadecimal digits A..F as suffix. ACTIVATION itself is not defined
 * here and must be available where these macros are expanded.
 *
 * @param[in] ACTIVATION_TYPE Forwarded unchanged to ACTIVATION
 * @param[in] DATA_TYPE       Forwarded unchanged to ACTIVATION
 * @param[in] VEC_SIZE        Forwarded unchanged to ACTIVATION
 * @param[in] BASENAME        Name prefix of the variables that are overwritten with the result
 * @param[in] A_VAL           Forwarded unchanged to ACTIVATION
 * @param[in] B_VAL           Forwarded unchanged to ACTIVATION
 */
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);

#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);

#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);

#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);

#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);

#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);

#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);

/** Apply ACTIVATION to the first N rows of block BASENAME.
 *
 * ACTIVATION_BLOCK forwards to ACTIVATION_BLOCK_STR so that an N which is
 * itself a macro is expanded to its value before it is pasted onto
 * ACTIVATION_ROW_.
 *
 * @param[in] N               Number of rows; selects ACTIVATION_ROW_<N>
 * @param[in] ACTIVATION_TYPE Forwarded unchanged to ACTIVATION
 * @param[in] DATA_TYPE       Forwarded unchanged to ACTIVATION
 * @param[in] VEC_SIZE        Forwarded unchanged to ACTIVATION
 * @param[in] BASENAME        Name prefix of the variables that are overwritten with the result
 * @param[in] A_VAL           Forwarded unchanged to ACTIVATION
 * @param[in] B_VAL           Forwarded unchanged to ACTIVATION
 */
#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3708*c217d954SCole Faust
3709*c217d954SCole Faust
3710*c217d954SCole Faust
/** Row-by-row conversion of a block into newly declared variables.
 *
 * CONVERT_ROW_<m>(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) expands to the
 * declarations
 *     VEC_DATA_TYPE(DATA_TYPE, N) BASENAME_DST##i = CONVERT(BASENAME_SRC##i, VEC_DATA_TYPE(DATA_TYPE, N));
 * for every row i from 0 to m - 1, in ascending order. Rows 10..15 use the
 * single hexadecimal digits A..F as suffix. Because each row is a
 * declaration, the BASENAME_DST##i names must not already exist in the scope
 * where the macro is expanded. VEC_DATA_TYPE and CONVERT are not defined here
 * and must be available at the expansion site.
 *
 * @param[in] N            Second argument of VEC_DATA_TYPE for the destination type
 * @param[in] DATA_TYPE    First argument of VEC_DATA_TYPE for the destination type
 * @param[in] BASENAME_SRC Name prefix of the variables that are read
 * @param[in] BASENAME_DST Name prefix of the variables that are declared
 */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));

/** Convert the first M rows of block BASENAME_SRC into new variables BASENAME_DST##i.
 *
 * CONVERT_BLOCK forwards to CONVERT_BLOCK_STR so that an M which is itself a
 * macro is expanded to its value before it is pasted onto CONVERT_ROW_.
 *
 * @param[in] M            Number of rows; selects CONVERT_ROW_<M>
 * @param[in] N            Second argument of VEC_DATA_TYPE for the destination type
 * @param[in] DATA_TYPE    First argument of VEC_DATA_TYPE for the destination type
 * @param[in] BASENAME_SRC Name prefix of the variables that are read
 * @param[in] BASENAME_DST Name prefix of the variables that are declared
 */
#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3794*c217d954SCole Faust
3795*c217d954SCole Faust
3796*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
3797*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
3798*c217d954SCole Faust
3799*c217d954SCole Faust
3800*c217d954SCole Faust
3801*c217d954SCole Faust
/** Row-by-row store of a block.
 *
 * STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) expands to
 *     VSTORE(N0)(BASENAME##i, 0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 * for every row i from 0 to n - 1, in ascending order. Rows 10..15 use the
 * single hexadecimal digits A..F as name suffix, while the row multiplier in
 * the address stays decimal (10..15). VSTORE is not defined here and must be
 * available where these macros are expanded.
 *
 * PTR and STRIDE_Y are pasted into the address expression without extra
 * parentheses, so compound expressions should be parenthesised by the caller.
 *
 * @param[in] N0        Argument of VSTORE; selects the store that is used
 * @param[in] DATA_TYPE Element type the destination address is cast to
 * @param[in] BASENAME  Name prefix of the variables that are stored
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Address increment between two consecutive rows
 * @param[in] Z         Name prefix of the per-row offsets Z##i added to the address
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3880*c217d954SCole Faust
3881*c217d954SCole Faust
3882*c217d954SCole Faust
/** Row-by-row convert-and-store of a block, rows 0..8.
 *
 * CONVERT_STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) expands to
 *     VSTORE(N0)(CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
 *                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 * for every row i from 0 to n - 1, in ascending order. VSTORE, CONVERT_SAT
 * and VEC_DATA_TYPE are not defined here and must be available where these
 * macros are expanded.
 *
 * PTR and STRIDE_Y are pasted into the address expression without extra
 * parentheses, so compound expressions should be parenthesised by the caller.
 *
 * @param[in] N0        Argument of VSTORE and second argument of VEC_DATA_TYPE
 * @param[in] DATA_TYPE Target element type of the conversion and of the destination address cast
 * @param[in] BASENAME  Name prefix of the variables that are converted and stored
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Address increment between two consecutive rows
 * @param[in] Z         Name prefix of the per-row offsets Z##i added to the address
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3926*c217d954SCole Faust
/** Convert-and-store of a block of 10 rows.
 *
 * Expands to CONVERT_STORE_ROW_9 for rows 0..8 followed by the store of row 9:
 * BASENAME##9 is passed through CONVERT_SAT with target type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and handed to VSTORE(N0) together with the
 * address (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9).
 *
 * The second parameter must be spelled DATA_TYPE, as in every sibling
 * CONVERT_STORE_ROW_<n>, because the body refers to it by that name. With any
 * other parameter name the caller's argument would be ignored and the
 * DATA_TYPE tokens would resolve to whatever DATA_TYPE means at the expansion
 * site (or fail to compile if it means nothing there).
 *
 * @param[in] N0        Argument of VSTORE and second argument of VEC_DATA_TYPE
 * @param[in] DATA_TYPE Target element type of the conversion and of the destination address cast
 * @param[in] BASENAME  Name prefix of the variables that are converted and stored
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Address increment between two consecutive rows
 * @param[in] Z         Name prefix of the per-row offsets Z##i added to the address
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3931*c217d954SCole Faust
/** Row-by-row convert-and-store of a block, rows 10..14.
 *
 * CONVERT_STORE_ROW_<n> (n = 11..15) expands to CONVERT_STORE_ROW_<n-1> followed by
 *     VSTORE(N0)(CONVERT_SAT((BASENAME##X), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
 *                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##X));
 * for the last row i = n - 1, where X is i written as a single hexadecimal
 * digit (A..E) and the row multiplier i in the address stays decimal (10..14).
 * VSTORE, CONVERT_SAT and VEC_DATA_TYPE are not defined here and must be
 * available where these macros are expanded.
 *
 * @param[in] N0        Argument of VSTORE and second argument of VEC_DATA_TYPE
 * @param[in] DATA_TYPE Target element type of the conversion and of the destination address cast
 * @param[in] BASENAME  Name prefix of the variables that are converted and stored
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Address increment between two consecutive rows
 * @param[in] Z         Name prefix of the per-row offsets added to the address
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
3956*c217d954SCole Faust
3957*c217d954SCole Faust#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3958*c217d954SCole Faust    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3959*c217d954SCole Faust    VSTORE(N0)                                                          \
3960*c217d954SCole Faust    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3961*c217d954SCole Faust
3962*c217d954SCole Faust
3963*c217d954SCole Faust
3964*c217d954SCole Faust
/** Store a block of M0 rows by dispatching to STORE_ROW_<M0>.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is macro-expanded before the ##
 * operator in STORE_BLOCK_STR pastes it onto STORE_ROW_. Calling STORE_BLOCK_STR directly
 * would paste the unexpanded spelling of M0 instead.
 * All remaining arguments are forwarded unchanged to the selected STORE_ROW_n macro.
 */
3965*c217d954SCole Faust#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3966*c217d954SCole Faust#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3967*c217d954SCole Faust
3968*c217d954SCole Faust
3969*c217d954SCole Faust
/** Convert-and-store a block of M0 rows by dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * Same two-level scheme as STORE_BLOCK: CONVERT_STORE_BLOCK lets M0 be macro-expanded,
 * then CONVERT_STORE_BLOCK_STR pastes the result onto CONVERT_STORE_ROW_ and forwards the
 * remaining arguments unchanged.
 */
3970*c217d954SCole Faust#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3971*c217d954SCole Faust#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3972*c217d954SCole Faust
3973*c217d954SCole Faust
3974*c217d954SCole Faust
/** STORE_ROW_PARTIAL_1 .. STORE_ROW_PARTIAL_16: store the first 1 .. 16 rows through
 *  VSTORE_PARTIAL(N0, STORE_N0).
 *
 * STORE_ROW_PARTIAL_n expands STORE_ROW_PARTIAL_(n-1) and then appends the store of row
 * n-1. For row i the variable BASENAME##i is written, with element offset 0, to
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 * Rows 10..15 use the hexadecimal digits A..F as the pasted suffix.
 *
 * N0 and STORE_N0 are only forwarded to VSTORE_PARTIAL, which is defined outside this
 * chunk; no conversion is applied to the row values here.
 */
3975*c217d954SCole Faust#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3976*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3977*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3978*c217d954SCole Faust
3979*c217d954SCole Faust#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3980*c217d954SCole Faust    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3981*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3982*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3983*c217d954SCole Faust
3984*c217d954SCole Faust#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3985*c217d954SCole Faust    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3986*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3987*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3988*c217d954SCole Faust
3989*c217d954SCole Faust#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3990*c217d954SCole Faust    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3991*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3992*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3993*c217d954SCole Faust
3994*c217d954SCole Faust#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3995*c217d954SCole Faust    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3996*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3997*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
3998*c217d954SCole Faust
3999*c217d954SCole Faust#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4000*c217d954SCole Faust    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4001*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4002*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
4003*c217d954SCole Faust
4004*c217d954SCole Faust#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4005*c217d954SCole Faust    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4006*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4007*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
4008*c217d954SCole Faust
4009*c217d954SCole Faust#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4010*c217d954SCole Faust    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4011*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4012*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
4013*c217d954SCole Faust
4014*c217d954SCole Faust#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4015*c217d954SCole Faust    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4016*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4017*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
4018*c217d954SCole Faust
4019*c217d954SCole Faust#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4020*c217d954SCole Faust    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
4021*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4022*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
4023*c217d954SCole Faust
4024*c217d954SCole Faust#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4025*c217d954SCole Faust    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4026*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4027*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
4028*c217d954SCole Faust
4029*c217d954SCole Faust#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4030*c217d954SCole Faust    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4031*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4032*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
4033*c217d954SCole Faust
4034*c217d954SCole Faust#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4035*c217d954SCole Faust    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4036*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4037*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
4038*c217d954SCole Faust
4039*c217d954SCole Faust#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4040*c217d954SCole Faust    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4041*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4042*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
4043*c217d954SCole Faust
4044*c217d954SCole Faust#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4045*c217d954SCole Faust    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4046*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4047*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
4048*c217d954SCole Faust
4049*c217d954SCole Faust#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4050*c217d954SCole Faust    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4051*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4052*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
4053*c217d954SCole Faust
4054*c217d954SCole Faust
4055*c217d954SCole Faust
/** Store STORE_M0 rows through the partial-store path by dispatching to
 *  STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * STORE_BLOCK_PARTIAL lets STORE_M0 be macro-expanded before STORE_BLOCK_PARTIAL_STR pastes
 * it onto STORE_ROW_PARTIAL_. Note the argument reordering: the block macros take
 * (STORE_M0, STORE_N0, N0, ...) while the row macros take (N0, STORE_N0, ...).
 */
4056*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4057*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4058*c217d954SCole Faust
/** Store a block that may be partial in both dimensions, chosen at run time.
 *
 * Expands to an if / else-if chain over the two conditions:
 *   - neither PARTIAL_COND_X nor PARTIAL_COND_Y: M0 rows, STORE_N0 = N0
 *   - only PARTIAL_COND_Y:                       PARTIAL_STORE_M0 rows, STORE_N0 = N0
 *   - only PARTIAL_COND_X:                       M0 rows, STORE_N0 = PARTIAL_STORE_N0
 *   - both:                                      PARTIAL_STORE_M0 rows, STORE_N0 = PARTIAL_STORE_N0
 * Every branch goes through STORE_BLOCK_PARTIAL, including the full-block case.
 *
 * The expansion is a bare if/else statement, not wrapped in do { } while (0), so it must be
 * used where a statement is allowed and not as the unbraced body of another if.
 */
4059*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4060*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
4061*c217d954SCole Faust    {                                                                                                                                                     \
4062*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
4063*c217d954SCole Faust    }                                                                                                                                                     \
4064*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
4065*c217d954SCole Faust    {                                                                                                                                                     \
4066*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
4067*c217d954SCole Faust    }                                                                                                                                                     \
4068*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
4069*c217d954SCole Faust    {                                                                                                                                                     \
4070*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
4071*c217d954SCole Faust    }                                                                                                                                                     \
4072*c217d954SCole Faust    else                                                                                                                                                  \
4073*c217d954SCole Faust    {                                                                                                                                                     \
4074*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
4075*c217d954SCole Faust    }
4076*c217d954SCole Faust
/** Store a block that may be partial in the X dimension only, chosen at run time.
 *
 * All M0 rows are always stored. When PARTIAL_COND_X is false each row is stored with
 * STORE_N0 = N0; otherwise with STORE_N0 = PARTIAL_STORE_N0.
 * The expansion is a bare if/else statement (no do { } while (0) wrapper).
 */
4077*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
4078*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                         \
4079*c217d954SCole Faust    {                                                                                                             \
4080*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
4081*c217d954SCole Faust    }                                                                                                             \
4082*c217d954SCole Faust    else                                                                                                          \
4083*c217d954SCole Faust    {                                                                                                             \
4084*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
4085*c217d954SCole Faust    }
4086*c217d954SCole Faust
/** Store a block that may be partial in the Y dimension only, chosen at run time.
 *
 * Rows are always stored with STORE_N0 = N0. When PARTIAL_COND_Y is false M0 rows are
 * stored; otherwise only PARTIAL_STORE_M0 rows are stored.
 * The expansion is a bare if/else statement (no do { } while (0) wrapper).
 */
4087*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
4088*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                         \
4089*c217d954SCole Faust    {                                                                                                             \
4090*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
4091*c217d954SCole Faust    }                                                                                                             \
4092*c217d954SCole Faust    else                                                                                                          \
4093*c217d954SCole Faust    {                                                                                                             \
4094*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
4095*c217d954SCole Faust    }
4096*c217d954SCole Faust
4097*c217d954SCole Faust
/** STORE_BLOCK_BOUNDARY_AWARE: block store whose implementation is picked at preprocessing
 *  time from the build-time values of PARTIAL_STORE_M0 and PARTIAL_STORE_N0.
 *
 * The macro is only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined.
 * All four variants share one parameter list, so call sites do not depend on the variant:
 *   - both 0:                 plain STORE_BLOCK; the partial sizes and conditions are ignored
 *   - M0 > 0 and N0 == 0:     STORE_BLOCK_PARTIAL_IN_Y
 *   - M0 == 0 and N0 > 0:     STORE_BLOCK_PARTIAL_IN_X
 *   - anything else:          STORE_BLOCK_PARTIAL_IN_X_AND_Y
 *
 * Inside each definition the parameters named PARTIAL_STORE_M0 / PARTIAL_STORE_N0 shadow
 * the build-time macros of the same name; the values used in the expansion are the ones
 * passed by the caller.
 */
4098*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
4099*c217d954SCole Faust
4100*c217d954SCole Faust
4101*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
4102*c217d954SCole Faust
4103*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4104*c217d954SCole Faust    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4105*c217d954SCole Faust
4106*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
4107*c217d954SCole Faust
4108*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4109*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
4110*c217d954SCole Faust
4111*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
4112*c217d954SCole Faust
4113*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4114*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
4115*c217d954SCole Faust
4116*c217d954SCole Faust#else
4117*c217d954SCole Faust
4118*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4119*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
4120*c217d954SCole Faust
4121*c217d954SCole Faust#endif
4122*c217d954SCole Faust
4123*c217d954SCole Faust#endif
4124*c217d954SCole Faust
4125*c217d954SCole Faust
/** Compute the first row handled by the work-item with row index y.
 *
 * When the build defines PARTIAL_STORE_M0 the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * evaluated in int and then cast to uint. When the PARTIAL_STORE_M0 argument is 0 the
 * modulo term is 0 and the result is simply y * M0.
 * When the build does not define PARTIAL_STORE_M0 the third argument is ignored and the
 * result is (uint)(y * M0).
 *
 * Every macro argument is parenthesized in the expansion so that compound expressions
 * such as COMPUTE_M0_START_ROW(a + b, M0, P) are evaluated as written instead of being
 * regrouped by operator precedence.
 *
 * @param y                Row index of the work-item.
 * @param M0               Number of rows per block.
 * @param PARTIAL_STORE_M0 Number of rows in the partial block; this parameter shadows the
 *                         build-time macro of the same name inside the expansion.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
4134*c217d954SCole Faust
4135*c217d954SCole Faust
4136*c217d954SCole Faust
/** Store a single vector, choosing at run time between a full and a leftover-width store.
 *
 * Thin wrapper over STORE_BLOCK_PARTIAL_IN_X with M0 = 1: when cond is false, vec_size
 * elements are stored; otherwise leftover elements are stored.
 * STRIDE_Y is passed as 0 and Z as the token 0. With one row, the row macro pastes 0 onto
 * both basename and Z, so the variable that gets stored is basename##0 and the Z term
 * becomes the literal 00, which is zero.
 */
4137*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
4138*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
4139*c217d954SCole Faust
4140*c217d954SCole Faust
/* Optional OpenCL extensions.
 * Each extension is enabled only when both the corresponding ARM_COMPUTE_* build flag and
 * the extension's own feature macro are defined, so a build flag alone never enables an
 * extension whose feature macro is absent.
 */
4141*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4142*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable
4143*c217d954SCole Faust#endif
4144*c217d954SCole Faust
4145*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
4146*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
4147*c217d954SCole Faust#endif
4148*c217d954SCole Faust
4149*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
4150*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
4151*c217d954SCole Faust#endif
4152*c217d954SCole Faust
4153*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
4154*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable
4155*c217d954SCole Faust#endif
4156*c217d954SCole Faust
/* Numeric identifiers for three GPU architecture families.
 * The code that compares against these values is not visible in this chunk.
 */
4157*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100
4158*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200
4159*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300
4160*c217d954SCole Faust
4161*c217d954SCole Faust
/** Paste two tokens together. Because ## is applied directly, the arguments are pasted
 *  as spelled and are not macro-expanded first.
 */
4162*c217d954SCole Faust#define CONCAT(a, b) a##b
4163*c217d954SCole Faust
4164*c217d954SCole Faust
/** Identity macro: expands to its argument after the argument has been macro-expanded. */
4165*c217d954SCole Faust#define EXPAND(x) x
4166*c217d954SCole Faust
4167*c217d954SCole Faust
/** Clamp x to the range [min_val, max_val] using the min and max functions.
 *  Each argument appears exactly once in the expansion.
 */
4168*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
4169*c217d954SCole Faust
4170*c217d954SCole Faust
/** REVn(x): reverse the component order of an n-element vector with a swizzle.
 *  Provided for n = 1, 2, 3, 4, 8 and 16; REV1 returns x unchanged.
 */
4171*c217d954SCole Faust#define REV1(x) ((x))
4172*c217d954SCole Faust#define REV2(x) ((x).s10)
4173*c217d954SCole Faust#define REV3(x) ((x).s210)
4174*c217d954SCole Faust#define REV4(x) ((x).s3210)
4175*c217d954SCole Faust#define REV8(x) ((x).s76543210)
4176*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210)
4177*c217d954SCole Faust
4178*c217d954SCole Faust
4179*c217d954SCole Faust
/** REVERSE(x, s): reverse vector x of size s.
 *  REVERSE lets s be macro-expanded before REVERSE_STR pastes it onto REV.
 */
4180*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x))
4181*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s)
4182*c217d954SCole Faust
4183*c217d954SCole Faust
4184*c217d954SCole Faust
/** ROTs_n(x): rotate the components of an s-element vector by n positions with a swizzle.
 *
 * Component i of the result is component (i - n) modulo s of x, i.e. values move towards
 * higher indices and wrap around; for example ROT4_1 yields (x.s3, x.s0, x.s1, x.s2).
 * Both n = 0 and n = s return x unchanged. Provided for s = 1, 2, 3, 4, 8 and 16.
 */
4185*c217d954SCole Faust#define ROT1_0(x) ((x))
4186*c217d954SCole Faust#define ROT1_1(x) ((x))
4187*c217d954SCole Faust
4188*c217d954SCole Faust#define ROT2_0(x) ((x))
4189*c217d954SCole Faust#define ROT2_1(x) ((x).s10)
4190*c217d954SCole Faust#define ROT2_2(x) ((x))
4191*c217d954SCole Faust
4192*c217d954SCole Faust#define ROT3_0(x) ((x))
4193*c217d954SCole Faust#define ROT3_1(x) ((x).s201)
4194*c217d954SCole Faust#define ROT3_2(x) ((x).s120)
4195*c217d954SCole Faust#define ROT3_3(x) ((x))
4196*c217d954SCole Faust
4197*c217d954SCole Faust#define ROT4_0(x) ((x))
4198*c217d954SCole Faust#define ROT4_1(x) ((x).s3012)
4199*c217d954SCole Faust#define ROT4_2(x) ((x).s2301)
4200*c217d954SCole Faust#define ROT4_3(x) ((x).s1230)
4201*c217d954SCole Faust#define ROT4_4(x) ((x))
4202*c217d954SCole Faust
4203*c217d954SCole Faust#define ROT8_0(x) ((x))
4204*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456)
4205*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345)
4206*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234)
4207*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123)
4208*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012)
4209*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701)
4210*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670)
4211*c217d954SCole Faust#define ROT8_8(x) ((x))
4212*c217d954SCole Faust
4213*c217d954SCole Faust#define ROT16_0(x) ((x))
4214*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
4215*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
4216*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
4217*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
4218*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
4219*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
4220*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
4221*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
4222*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
4223*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
4224*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
4225*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
4226*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
4227*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
4228*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
4229*c217d954SCole Faust#define ROT16_16(x) ((x))
4230*c217d954SCole Faust
4231*c217d954SCole Faust
4232*c217d954SCole Faust
/** ROTATE(x, s, n): rotate vector x of size s by n positions.
 *  ROTATE lets s and n be macro-expanded before ROTATE_STR pastes them into ROT<s>_<n>.
 */
4233*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
4234*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
4235*c217d954SCole Faust
4236*c217d954SCole Faust
4237*c217d954SCole Faust
/** V_OFFSn(dt): vector literal of type dt##n whose components are 0, 1, ..., n-1.
 *  Provided for n = 1, 2, 3, 4, 8 and 16.
 *  NOTE(review): V_OFFS1 relies on a type named dt##1 (e.g. int1) existing; that is
 *  presumably a typedef provided elsewhere -- confirm.
 */
4238*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0)
4239*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1)
4240*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2)
4241*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
4242*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
4243*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
4244*c217d954SCole Faust
4245*c217d954SCole Faust
4246*c217d954SCole Faust
/** VEC_OFFS(dt, s): index vector 0 .. s-1 of element type dt.
 *  VEC_OFFS lets s be macro-expanded before VEC_OFFS_STR pastes it onto V_OFFS.
 */
4247*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
4248*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
4249*c217d954SCole Faust
4250*c217d954SCole Faust
/** VLOAD(size): name of the vector load function for the given width, i.e. vload<size>.
 *  VLOAD lets size be macro-expanded before VLOAD_STR pastes it onto vload.
 */
4251*c217d954SCole Faust#define VLOAD_STR(size) vload##size
4252*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size)
4253*c217d954SCole Faust
4254*c217d954SCole Faust
/** VLOAD_PARTIAL(size, load_size): name of the partial-load macro
 *  vload_partial_<size>_<load_size>, which is then invoked as (DATA, OFFSET, PTR).
 *  Both arguments are macro-expanded before being pasted.
 */
4255*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
4256*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
4257*c217d954SCole Faust
/** Placeholder used for unsupported or zero-length partial loads: ignores its three
 *  arguments and expands to an empty compound statement.
 */
4258*c217d954SCole Faust#define NO_LOAD(data, offs, ptr) \
4259*c217d954SCole Faust    {                            \
4260*c217d954SCole Faust    }
4261*c217d954SCole Faust
4262*c217d954SCole Faust
/** Dispatch table vload_partial_<S>_<L>: load the first L elements into a vector of size S.
 *
 * Defined for S in {1, 2, 3, 4, 8, 16} and L in 0 .. 16:
 *   - L == 0 or L > S maps to NO_LOAD, so nothing is loaded;
 *   - 1 <= L <= S maps to vload_partial_<L>, except vload_partial_1_1 which maps to vload1.
 *
 * NOTE(review): vload_partial_1_1 is the only entry that does not go through a
 * three-argument vload_partial_<L> helper. vload1 is defined outside this chunk and is
 * used below with two arguments (OFFSET, PTR); confirm it accepts the (DATA, OFFSET, PTR)
 * call shape that VLOAD_PARTIAL users apply to every other entry.
 */
4263*c217d954SCole Faust#define vload_partial_1_0 NO_LOAD
4264*c217d954SCole Faust#define vload_partial_1_1 vload1
4265*c217d954SCole Faust#define vload_partial_1_2 NO_LOAD
4266*c217d954SCole Faust#define vload_partial_1_3 NO_LOAD
4267*c217d954SCole Faust#define vload_partial_1_4 NO_LOAD
4268*c217d954SCole Faust#define vload_partial_1_5 NO_LOAD
4269*c217d954SCole Faust#define vload_partial_1_6 NO_LOAD
4270*c217d954SCole Faust#define vload_partial_1_7 NO_LOAD
4271*c217d954SCole Faust#define vload_partial_1_8 NO_LOAD
4272*c217d954SCole Faust#define vload_partial_1_9 NO_LOAD
4273*c217d954SCole Faust#define vload_partial_1_10 NO_LOAD
4274*c217d954SCole Faust#define vload_partial_1_11 NO_LOAD
4275*c217d954SCole Faust#define vload_partial_1_12 NO_LOAD
4276*c217d954SCole Faust#define vload_partial_1_13 NO_LOAD
4277*c217d954SCole Faust#define vload_partial_1_14 NO_LOAD
4278*c217d954SCole Faust#define vload_partial_1_15 NO_LOAD
4279*c217d954SCole Faust#define vload_partial_1_16 NO_LOAD
4280*c217d954SCole Faust
4281*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
4282*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
4283*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
4284*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
4285*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
4286*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
4287*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
4288*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
4289*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
4290*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
4291*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
4292*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
4293*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
4294*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
4295*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
4296*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
4297*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
4298*c217d954SCole Faust
4299*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
4300*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
4301*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
4302*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
4303*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
4304*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
4305*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
4306*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
4307*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
4308*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
4309*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
4310*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
4311*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
4312*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
4313*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
4314*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
4315*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
4316*c217d954SCole Faust
4317*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
4318*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
4319*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
4320*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
4321*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
4322*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
4323*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
4324*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
4325*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
4326*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
4327*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
4328*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
4329*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
4330*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
4331*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
4332*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
4333*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
4334*c217d954SCole Faust
4335*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
4336*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
4337*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
4338*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
4339*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
4340*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
4341*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
4342*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
4343*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
4344*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
4345*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
4346*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
4347*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
4348*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
4349*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
4350*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
4351*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
4352*c217d954SCole Faust
4353*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
4354*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
4355*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
4356*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
4357*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
4358*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
4359*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
4360*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
4361*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
4362*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
4363*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
4364*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
4365*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
4366*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
4367*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
4368*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
4369*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
4370*c217d954SCole Faust
4371*c217d954SCole Faust
/** vload_partial_<L>(DATA, OFFSET, PTR): load L elements from PTR into the first L
 *  components of DATA, leaving the remaining components of DATA untouched.
 *
 * L = 1, 2, 3, 4 and 8 assign one vload<L> result to the matching swizzle of DATA.
 * The other lengths are composed from two pieces:
 *   - L = 5, 6, 7:    a 4-element load into DATA.s0123, then 1, 2 or 3 elements from PTR + 4;
 *   - L = 9 .. 12:    an 8-element load into DATA.s01234567, then 1 .. 4 elements from PTR + 8.
 * In the composed forms the sub-macros receive a swizzle of DATA as their DATA argument,
 * so the resulting assignment target is a swizzle of a swizzle.
 *
 * Every expansion already ends with a semicolon, and the nested invocations add one more,
 * which yields harmless empty statements.
 *
 * NOTE(review): OFFSET is forwarded unchanged to sub-loads of different widths while the
 * second piece is addressed through PTR + 4 or PTR + 8; this looks designed for
 * OFFSET == 0 -- confirm that no caller passes a nonzero offset.
 */
4372*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
4373*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
4374*c217d954SCole Faust
4375*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
4376*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
4377*c217d954SCole Faust
4378*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
4379*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
4380*c217d954SCole Faust
4381*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
4382*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
4383*c217d954SCole Faust
4384*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
4385*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4386*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
4387*c217d954SCole Faust
4388*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
4389*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4390*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
4391*c217d954SCole Faust
4392*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
4393*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4394*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
4395*c217d954SCole Faust
4396*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
4397*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
4398*c217d954SCole Faust
4399*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
4400*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4401*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
4402*c217d954SCole Faust
4403*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
4404*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4405*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
4406*c217d954SCole Faust
4407*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
4408*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4409*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
4410*c217d954SCole Faust
4411*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
4412*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4413*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
4414*c217d954SCole Faust
4415*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
4416*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4417*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
4418*c217d954SCole Faust
4419*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
4420*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4421*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
4422*c217d954SCole Faust
4423*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
4424*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4425*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
4426*c217d954SCole Faust
4427*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
4428*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
4429*c217d954SCole Faust
4430*c217d954SCole Faust
4431*c217d954SCole Faust
/* Pixel counts for 4-, 8- and 16-element vectors (one per four elements). */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* Map a vector size (4, 8 or 16) to its PIXEL_UNIT<size> value. The outer
 * macro lets vec_size itself be a macro that is expanded before pasting. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size)     CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
4439*c217d954SCole Faust
4440*c217d954SCole Faust
/* read_image2d_<type>x<n>(img, x_coord, y_coord): read <n> pixels at
 * x_coord, x_coord + 1, ... on row y_coord and pack the 4-component results
 * into one vector of 4 * <n> elements. The trailing ';' is part of each
 * expansion and is kept as-is. */
#define read_image2d_floatx1(img, x_coord, y_coord) \
    (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)), \
             read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)), \
              read_imagef(img, (int2)(x_coord + 1, y_coord)), \
              read_imagef(img, (int2)(x_coord + 2, y_coord)), \
              read_imagef(img, (int2)(x_coord + 3, y_coord)));

/* Half-precision variants, only when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) \
    (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)), \
            read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)), \
             read_imageh(img, (int2)(x_coord + 1, y_coord)), \
             read_imageh(img, (int2)(x_coord + 2, y_coord)), \
             read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif
4450*c217d954SCole Faust
/* write_image2d_<type>x<n>(img, x_coord, y_coord, values): write <n> pixels at
 * x_coord, x_coord + 1, ... on row y_coord, taking four consecutive lanes of
 * values per pixel. Multi-pixel forms are one comma expression; the trailing
 * ';' is part of each expansion and is kept as-is. */
#define write_image2d_floatx1(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
     write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
     write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

/* Half-precision variants, only when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
     write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
     write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif
4460*c217d954SCole Faust
4461*c217d954SCole Faust
/* READ_IMAGE2D(data_type, n0, ...) dispatches to read_image2d_<data_type>x<n0>.
 * The *_STR macro does the token pasting; the outer macro exists so that
 * data_type and n0 are macro-expanded first. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) \
    read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) \
    READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)


/* WRITE_IMAGE2D(data_type, n0, ...) dispatches to write_image2d_<data_type>x<n0>
 * in the same two-step way. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
    write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
    WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
4468*c217d954SCole Faust
/* VSTORE(n) yields the identifier vstore<n>; n is macro-expanded before pasting. */
#define VSTORE_STR(n) vstore##n
#define VSTORE(n) VSTORE_STR(n)
4471*c217d954SCole Faust
/* "<type>1" spellings resolve to the scalar type, so a pasted <type><size>
 * name is also usable when the size is 1. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double
4483*c217d954SCole Faust
/* Scalar counterparts of vloadN / vstoreN:
 *   vload1(OFFSET, PTR)        reads  the element at PTR + OFFSET
 *   vstore1(DATA, OFFSET, PTR) writes DATA to the element at PTR + OFFSET
 * Every argument and the whole expansion are parenthesised so that operands
 * such as `c ? 1 : 0`, or use of the result inside a larger expression, group
 * as intended. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
4486*c217d954SCole Faust
4487*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size) yields vstore_partial_<size>_<store_size>;
 * both arguments are macro-expanded before the inner macro pastes them. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Takes the same three arguments as a store helper, discards them and
 * expands to an empty block. */
#define NO_STORE(data, offs, ptr) {}
4494*c217d954SCole Faust
4495*c217d954SCole Faust
/* Size-1 data: only a store count of 1 writes (through vstore1); every other
 * count resolves to NO_STORE. */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
4513*c217d954SCole Faust
/* 2-lane data: counts 1..2 map to vstore_partial_<count>; 0 and anything
 * above the vector width resolve to NO_STORE. */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
4531*c217d954SCole Faust
/* 3-lane data: counts 1..3 map to vstore_partial_<count>; 0 and anything
 * above the vector width resolve to NO_STORE. */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
4549*c217d954SCole Faust
/* 4-lane data: counts 1..4 map to vstore_partial_<count>; 0 and anything
 * above the vector width resolve to NO_STORE. */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
4567*c217d954SCole Faust
/* 8-lane data: counts 1..8 map to vstore_partial_<count>; 0 and anything
 * above the vector width resolve to NO_STORE. */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
4585*c217d954SCole Faust
/* 16-lane data: every count 1..16 maps to vstore_partial_<count>; a count of
 * zero resolves to NO_STORE. */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
4603*c217d954SCole Faust
4604*c217d954SCole Faust
/* vstore_partial_<n>(DATA, OFFSET, PTR): write the first <n> lanes of DATA to
 * memory at PTR. Counts 1, 2, 3, 4, 8 and 16 are a single vstoreN call; the
 * remaining counts chain a 4- or 8-lane store with a smaller store issued at
 * PTR + 4 or PTR + 8. OFFSET is forwarded unchanged to every vstoreN.
 * Every expansion already ends with ';'. */
#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR);

/* 5..7 lanes: low four lanes first, then the remainder at PTR + 4. */
#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR);

/* 9..15 lanes: low eight lanes first, then the remainder at PTR + 8. */
#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* 13..15 hand the whole upper half to the 5/6/7-lane helper, which only
 * stores the leading lanes of that half. */
#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR);
4662*c217d954SCole Faust
4663*c217d954SCole Faust
4664*c217d954SCole Faust
4665*c217d954SCole Faust
4666*c217d954SCole Faust
/* CONVERT_SAT pastes "convert_<type>_sat". For float and half destinations
 * these aliases route that name to the plain conversion of the same
 * destination type and width (saturation applies to integer destinations in
 * OpenCL C). The "1" spellings map to the scalar conversion. */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
/* Previously mapped to convert_float, so CONVERT_SAT(x, half) produced a
 * float while CONVERT_SAT(x, half1) produced a half. */
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16
4681*c217d954SCole Faust
/* "convert_<type>1" resolves to the scalar conversion, so a pasted
 * convert_<type><size> name is also usable when the size is 1. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
4693*c217d954SCole Faust
/* "convert_<type>1_sat" resolves to the scalar saturating conversion. */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
/* The vector uchar entries below expand to themselves: they leave the name
 * unchanged and only make it visible to #ifdef / defined checks. */
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
#define convert_double1_sat convert_double_sat
4708*c217d954SCole Faust
/* VEC_DATA_TYPE(type, size) yields the identifier <type><size>. As with the
 * other macro pairs here, the outer macro expands its arguments and the inner
 * *_STR macro pastes them. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT(value, dst_type) calls convert_<dst_type>(value). */
#define CONVERT_STR(value, dst_type) (convert_##dst_type((value)))
#define CONVERT(value, dst_type) CONVERT_STR(value, dst_type)

/* CONVERT_SAT(value, dst_type) calls convert_<dst_type>_sat(value). */
#define CONVERT_SAT_STR(value, dst_type) (convert_##dst_type##_sat((value)))
#define CONVERT_SAT(value, dst_type) CONVERT_SAT_STR(value, dst_type)

/* CONVERT_SAT_ROUND(value, dst_type, mode) calls convert_<dst_type>_sat_<mode>(value). */
#define CONVERT_SAT_ROUND_STR(value, dst_type, mode) (convert_##dst_type##_sat_##mode((value)))
#define CONVERT_SAT_ROUND(value, dst_type, mode) CONVERT_SAT_ROUND_STR(value, dst_type, mode)
4720*c217d954SCole Faust
/* select_vec_dt_<type>(size): vector type paired with <type> by the SELECT_*
 * macros below. Integer types map to themselves; half maps to short<size> and
 * float to int<size>. */
#define select_vec_dt_uchar(size)  uchar##size
#define select_vec_dt_char(size)   char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size)  short##size
#define select_vec_dt_half(size)   short##size
#define select_vec_dt_uint(size)   uint##size
#define select_vec_dt_int(size)    int##size
#define select_vec_dt_float(size)  int##size
#define select_vec_dt_ulong(size)  ulong##size
#define select_vec_dt_long(size)   long##size

/* SELECT_VEC_DATA_TYPE(type, size) dispatches to select_vec_dt_<type>(size);
 * SELECT_DATA_TYPE(type) is the size-1 form. */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size)     SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type)               SELECT_VEC_DATA_TYPE_STR(type, 1)
4735*c217d954SCole Faust
/* signed_int_vec_dt_<type>(size): signed integer vector type paired with
 * <type>. Unsigned types map to their signed counterpart; half maps to
 * short<size> and float to int<size>. */
#define signed_int_vec_dt_uchar(size)  char##size
#define signed_int_vec_dt_char(size)   char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size)  short##size
#define signed_int_vec_dt_half(size)   short##size
#define signed_int_vec_dt_uint(size)   int##size
#define signed_int_vec_dt_int(size)    int##size
#define signed_int_vec_dt_float(size)  int##size
#define signed_int_vec_dt_ulong(size)  long##size
#define signed_int_vec_dt_long(size)   long##size

/* SIGNED_INT_VEC_DATA_TYPE(type, size) dispatches to
 * signed_int_vec_dt_<type>(size); SIGNED_INT_DATA_TYPE(type) is the size-1 form. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size)     SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type)               SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
4750*c217d954SCole Faust
/* Horizontal sum of the lanes of a vector.
 *
 * sum_reduce_terms_<n>(x) expands to the bare chain s0 + s1 + ... of the <n>
 * lanes and is an implementation detail. It is deliberately left without outer
 * parentheses so that nesting keeps one flat left-to-right addition, which is
 * the evaluation order the original macros produced.
 *
 * sum_reduce_<n>(x) wraps that chain in parentheses so the result is safe
 * inside a larger expression, e.g. 2 * SUM_REDUCE(v, 4). */
#define sum_reduce_terms_2(x)  ((x).s0) + ((x).s1)
#define sum_reduce_terms_3(x)  sum_reduce_terms_2((x).s01) + ((x).s2)
#define sum_reduce_terms_4(x)  sum_reduce_terms_2((x).s01) + sum_reduce_terms_2((x).s23)
#define sum_reduce_terms_8(x)  sum_reduce_terms_4((x).s0123) + sum_reduce_terms_4((x).s4567)
#define sum_reduce_terms_16(x) sum_reduce_terms_8((x).s01234567) + sum_reduce_terms_8((x).s89ABCDEF)

#define sum_reduce_1(x)  (x)
#define sum_reduce_2(x)  (sum_reduce_terms_2(x))
#define sum_reduce_3(x)  (sum_reduce_terms_3(x))
#define sum_reduce_4(x)  (sum_reduce_terms_4(x))
#define sum_reduce_8(x)  (sum_reduce_terms_8(x))
#define sum_reduce_16(x) (sum_reduce_terms_16(x))

/* SUM_REDUCE(x, size) dispatches to sum_reduce_<size>(x); size is
 * macro-expanded before pasting. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
4760*c217d954SCole Faust
/* Horizontal product of the lanes of a vector.
 *
 * prod_reduce_factors_<n>(x) expands to the bare chain s0 * s1 * ... of the
 * <n> lanes and is an implementation detail. It is deliberately left without
 * outer parentheses so that nesting keeps one flat left-to-right
 * multiplication, which is the evaluation order the original macros produced.
 *
 * prod_reduce_<n>(x) wraps that chain in parentheses so the result is safe
 * inside a larger expression, e.g. a / PROD_REDUCE(v, 2). */
#define prod_reduce_factors_2(x)  ((x).s0) * ((x).s1)
#define prod_reduce_factors_3(x)  prod_reduce_factors_2((x).s01) * ((x).s2)
#define prod_reduce_factors_4(x)  prod_reduce_factors_2((x).s01) * prod_reduce_factors_2((x).s23)
#define prod_reduce_factors_8(x)  prod_reduce_factors_4((x).s0123) * prod_reduce_factors_4((x).s4567)
#define prod_reduce_factors_16(x) prod_reduce_factors_8((x).s01234567) * prod_reduce_factors_8((x).s89ABCDEF)

#define prod_reduce_1(x)  (x)
#define prod_reduce_2(x)  (prod_reduce_factors_2(x))
#define prod_reduce_3(x)  (prod_reduce_factors_3(x))
#define prod_reduce_4(x)  (prod_reduce_factors_4(x))
#define prod_reduce_8(x)  (prod_reduce_factors_8(x))
#define prod_reduce_16(x) (prod_reduce_factors_16(x))

/* PROD_REDUCE(x, size) dispatches to prod_reduce_<size>(x); size is
 * macro-expanded before pasting. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
4770*c217d954SCole Faust
/* Horizontal maximum of the lanes of a vector, built by pairing max over the
 * two halves of the input. */
#define max_reduce_1(x)  (x)
#define max_reduce_2(x)  max(((x).s0), ((x).s1))
#define max_reduce_3(x)  max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x)  max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x)  max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

/* MAX_REDUCE(x, size) dispatches to max_reduce_<size>(x); size is
 * macro-expanded before pasting. */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
4780*c217d954SCole Faust
/* Kernel parameter lists. Each *_DECLARATION(name) expands to a
 * comma-separated list: the __global uchar pointer name_ptr, one
 * name_stride_<d> / name_step_<d> pair of uint per dimension, and finally
 * name_offset_first_element_in_bytes. */

/* One dimension: x. */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

/* Two dimensions: x, y. */
#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

/* Three dimensions: x, y, z. */
#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

/* Four dimensions: x, y, z, w. */
#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

/* Five dimensions: x, y, z, w, v. */
#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes
4830*c217d954SCole Faust
/* Forward the name_* kernel parameters to update_vector_workitem_ptr.
 * The _NO_STEP form passes 0 in place of name_step_x. */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                               name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                               name##_stride_x, 0)

/* Forward the name_* kernel parameters to update_image_workitem_ptr.
 * The _NO_STEP form passes 0 in place of both steps. */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                              name##_stride_x, name##_step_x, \
                              name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                              name##_stride_x, 0, \
                              name##_stride_y, 0)
4842*c217d954SCole Faust
/* Forward the name_* kernel parameters of a 3D tensor to
 * update_image_from_tensor3D_workitem_ptr. Defined once; an identical second
 * definition that used to follow the _NO_STEP variant has been removed. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Same call with 0 in place of the x and y steps; name_step_z is still
 * passed through. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
4851*c217d954SCole Faust
/* Forward the name_* kernel parameters to update_tensor3D_workitem_ptr.
 * The _NO_STEP form passes 0 in place of all three steps. */
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x, \
                                 name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0, \
                                 name##_stride_y, 0, \
                                 name##_stride_z, 0)

/* Forward the name_* kernel parameters plus mod_size to
 * update_tensor4D_workitem_ptr. The _NO_STEP form passes 0 in place of all
 * four steps. */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x, \
                                 name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, \
                                 name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0, \
                                 name##_stride_y, 0, \
                                 name##_stride_z, 0, \
                                 name##_stride_w, 0, mod_size)

/* Forward the 3D name_* kernel parameters, steps included, to
 * tensor3D_ptr_no_update. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, \
                           name##_stride_x, name##_step_x, \
                           name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
4869*c217d954SCole Faust
4870*c217d954SCole Faust
/* Descriptor of a 1D buffer living in global memory. */
typedef struct Vector
{
    /* Byte pointer into the buffer. */
    __global uchar *ptr;
    /* Offset of the first element, in bytes. */
    int offset_first_element_in_bytes;
    /* Multiplier applied to an x index when computing a byte offset. */
    int stride_x;
} Vector;
4877*c217d954SCole Faust
4878*c217d954SCole Faust
/* Descriptor of a 2D buffer living in global memory. */
typedef struct Image
{
    /* Byte pointer into the buffer. */
    __global uchar *ptr;
    /* Offset of the first element, in bytes. */
    int offset_first_element_in_bytes;
    /* Multiplier applied to an x index when computing a byte offset. */
    int stride_x;
    /* Multiplier applied to a y index when computing a byte offset. */
    int stride_y;
} Image;
4886*c217d954SCole Faust
4887*c217d954SCole Faust
/* Descriptor of a 3D buffer living in global memory. */
typedef struct Tensor3D
{
    /* Byte pointer into the buffer. */
    __global uchar *ptr;
    /* Offset of the first element, in bytes. */
    int offset_first_element_in_bytes;
    /* Multiplier applied to an x index when computing a byte offset. */
    int stride_x;
    /* Multiplier applied to a y index when computing a byte offset. */
    int stride_y;
    /* Multiplier applied to a z index when computing a byte offset. */
    int stride_z;
} Tensor3D;
4896*c217d954SCole Faust
4897*c217d954SCole Faust
/* Descriptor of a 4D buffer living in global memory. */
typedef struct Tensor4D
{
    /* Byte pointer into the buffer. */
    __global uchar *ptr;
    /* Offset of the first element, in bytes. */
    int offset_first_element_in_bytes;
    /* Multiplier applied to an x index when computing a byte offset. */
    int stride_x;
    /* Multiplier applied to a y index when computing a byte offset. */
    int stride_y;
    /* Multiplier applied to a z index when computing a byte offset. */
    int stride_z;
    /* Multiplier applied to a w index when computing a byte offset. */
    int stride_w;
} Tensor4D;
4907*c217d954SCole Faust
4908*c217d954SCole Faust
/** Build a Vector descriptor whose pointer is advanced for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride stored in the descriptor
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 *
 * @return The populated descriptor; its ptr is
 *         ptr + offset_first_element_in_bytes + get_global_id(0) * step_x.
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vec;

    vec.ptr                           = ptr;
    vec.offset_first_element_in_bytes = offset_first_element_in_bytes;
    vec.stride_x                      = stride_x;

    /* The offset is read back from the (int) struct field, not from the uint argument. */
    vec.ptr += vec.offset_first_element_in_bytes + get_global_id(0) * step_x;

    return vec;
}
4920*c217d954SCole Faust
4921*c217d954SCole Faust
/** Build an Image descriptor whose pointer is advanced for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      X stride stored in the descriptor
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 * @param[in] stride_y                      Y stride stored in the descriptor
 * @param[in] step_y                        Bytes advanced per unit of get_global_id(1)
 *
 * @return The populated descriptor with ptr moved by the offset plus both step terms.
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image image;

    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;

    /* The offset is read back from the (int) struct field, not from the uint argument. */
    image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;

    return image;
}
4934*c217d954SCole Faust
4935*c217d954SCole Faust
/** Build an Image descriptor from 3D tensor arguments, advanced for the calling work-item.
 *
 * Only the x and y strides are stored; stride_z is accepted but not kept.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      X stride stored in the descriptor
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 * @param[in] stride_y                      Y stride stored in the descriptor
 * @param[in] step_y                        Bytes advanced per unit of get_global_id(1)
 * @param[in] stride_z                      Unused by this function
 * @param[in] step_z                        Bytes advanced per unit of get_global_id(2)
 *
 * @return The populated descriptor with ptr moved by the offset plus the three step terms.
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image image;

    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;

    /* The offset is read back from the (int) struct field, not from the uint argument. */
    image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;

    return image;
}
4948*c217d954SCole Faust
4949*c217d954SCole Faust
/** Build a Tensor3D descriptor whose pointer is advanced for the calling work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      X stride stored in the descriptor
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 * @param[in] stride_y                      Y stride stored in the descriptor
 * @param[in] step_y                        Bytes advanced per unit of get_global_id(1)
 * @param[in] stride_z                      Z stride stored in the descriptor
 * @param[in] step_z                        Bytes advanced per unit of get_global_id(2)
 *
 * @return The populated descriptor with ptr moved by the offset plus the three step terms.
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t;

    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;

    /* The offset is read back from the (int) struct field, not from the uint argument. */
    t.ptr += t.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;

    return t;
}
4963*c217d954SCole Faust
4964*c217d954SCole Faust
/** Build a Tensor3D descriptor without moving the pointer.
 *
 * The step arguments are accepted but not used; ptr is stored as passed in.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes (stored only)
 * @param[in] stride_x                      X stride stored in the descriptor
 * @param[in] step_x                        Unused
 * @param[in] stride_y                      Y stride stored in the descriptor
 * @param[in] step_y                        Unused
 * @param[in] stride_z                      Z stride stored in the descriptor
 * @param[in] step_z                        Unused
 *
 * @return The populated descriptor.
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D t;

    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;

    return t;
}
4977*c217d954SCole Faust
/** Build a Tensor4D descriptor whose pointer is advanced for the calling work-item.
 *
 * get_global_id(2) is split by mod_size: the remainder scales step_z and the
 * quotient scales step_w.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      X stride stored in the descriptor
 * @param[in] step_x                        Bytes advanced per unit of get_global_id(0)
 * @param[in] stride_y                      Y stride stored in the descriptor
 * @param[in] step_y                        Bytes advanced per unit of get_global_id(1)
 * @param[in] stride_z                      Z stride stored in the descriptor
 * @param[in] step_z                        Bytes advanced per unit of (get_global_id(2) % mod_size)
 * @param[in] stride_w                      W stride stored in the descriptor
 * @param[in] step_w                        Bytes advanced per unit of (get_global_id(2) / mod_size)
 * @param[in] mod_size                      Divisor used to split get_global_id(2); must be non-zero
 *
 * @return The populated descriptor with ptr moved by the offset plus the four step terms.
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D t;

    t.ptr                           = ptr;
    t.offset_first_element_in_bytes = offset_first_element_in_bytes;
    t.stride_x                      = stride_x;
    t.stride_y                      = stride_y;
    t.stride_z                      = stride_z;
    t.stride_w                      = stride_w;

    /* The offset is read back from the (int) struct field, not from the uint argument. */
    t.ptr += t.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;

    return t;
}
4995*c217d954SCole Faust
4996*c217d954SCole Faust
/** Address of element x relative to the descriptor's current pointer.
 *
 * @param[in] vec Vector descriptor
 * @param[in] x   Index along x
 *
 * @return vec->ptr advanced by x * vec->stride_x bytes.
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    __global const uchar *addr = vec->ptr;

    addr += x * vec->stride_x;

    return addr;
}
5001*c217d954SCole Faust
5002*c217d954SCole Faust
/** Address of element (x, y) relative to the descriptor's current pointer.
 *
 * @param[in] img Image descriptor
 * @param[in] x   Index along x
 * @param[in] y   Index along y
 *
 * @return img->ptr advanced by x * img->stride_x and then by y * img->stride_y bytes.
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    __global uchar *addr = img->ptr;

    /* Advance one axis at a time, in the same order as a single chained sum. */
    addr += x * img->stride_x;
    addr += y * img->stride_y;

    return addr;
}
5007*c217d954SCole Faust
5008*c217d954SCole Faust
/** Address of element (x, y, z) relative to the descriptor's current pointer.
 *
 * @param[in] tensor Tensor3D descriptor
 * @param[in] x      Index along x
 * @param[in] y      Index along y
 * @param[in] z      Index along z
 *
 * @return tensor->ptr advanced by each index multiplied by its stride.
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    __global const uchar *addr = tensor->ptr;

    /* Advance one axis at a time, in the same order as a single chained sum. */
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;

    return addr;
}
5013*c217d954SCole Faust
5014*c217d954SCole Faust
/** Address of element (x, y, z, w) relative to the descriptor's current pointer.
 *
 * @param[in] tensor Tensor4D descriptor
 * @param[in] x      Index along x
 * @param[in] y      Index along y
 * @param[in] z      Index along z
 * @param[in] w      Index along w
 *
 * @return tensor->ptr advanced by each index multiplied by its stride.
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    __global const uchar *addr = tensor->ptr;

    /* Advance one axis at a time, in the same order as a single chained sum. */
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    addr += w * tensor->stride_w;

    return addr;
}
5019*c217d954SCole Faust
5020*c217d954SCole Faust
/** Convert a linear element index into an address inside a 3D tensor.
 *
 * The index is decomposed as index = z * (width * height) + y * width + x.
 * Unlike the *_offset helpers, offset_first_element_in_bytes is added here.
 *
 * @param[in] tensor Tensor3D descriptor
 * @param[in] width  Extent along x; must be non-zero
 * @param[in] height Extent along y; must be non-zero
 * @param[in] depth  Not used by this function
 * @param[in] index  Linear element index
 *
 * @return tensor->ptr advanced by the strided (x, y, z) position and the first-element offset.
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_elements = width * height;

    const uint z        = index / plane_elements;
    const uint in_plane = index % plane_elements;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    __global const uchar *addr = tensor->ptr;

    /* Advance one term at a time, in the same order as a single chained sum. */
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    addr += tensor->offset_first_element_in_bytes;

    return addr;
}
5037*c217d954SCole Faust
5038*c217d954SCole Faust#endif
5039*c217d954SCole Faust
5040*c217d954SCole Faust#ifndef ARM_COMPUTE_REPEAT_H
5041*c217d954SCole Faust#define ARM_COMPUTE_REPEAT_H
5042*c217d954SCole Faust
5043*c217d954SCole Faust
5044*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
5045*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
5046*c217d954SCole Faust
5047*c217d954SCole Faust
5048*c217d954SCole Faust
5049*c217d954SCole Faust
/* STORE_ROW_n: store n consecutive row vectors (n = 1..16).
 *
 * Each macro chains to STORE_ROW_(n-1) and then stores one more row, so
 * STORE_ROW_n emits n store statements. Row k takes its value from the
 * variable BASENAME##k and is written to
 *     (__global DATA_TYPE *)(PTR + k * STRIDE_Y + Z##k)
 * Row suffixes are the hexadecimal digits 0-9, A-F.
 *
 * N0        : vector width handed to VSTORE (defined elsewhere in this file)
 * DATA_TYPE : element type used for the destination pointer cast
 * BASENAME  : prefix of the row variables
 * PTR       : base address of the destination
 * STRIDE_Y  : multiplier of the row index in the address computation
 * Z         : prefix of the per-row additional offsets
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5128*c217d954SCole Faust
5129*c217d954SCole Faust
5130*c217d954SCole Faust
/* CONVERT_STORE_ROW_n: convert and store n consecutive row vectors (n = 1..16).
 *
 * Same chaining scheme as a plain row store: each macro expands the (n-1)
 * variant and then stores one more row. Before the store, row k (variable
 * BASENAME##k) is passed through
 *     CONVERT_SAT(BASENAME##k, VEC_DATA_TYPE(DATA_TYPE, N0))
 * and the result is written to
 *     (__global DATA_TYPE *)(PTR + k * STRIDE_Y + Z##k)
 * Row suffixes are the hexadecimal digits 0-9, A-F.
 * VSTORE, CONVERT_SAT and VEC_DATA_TYPE are defined elsewhere in this file.
 *
 * N0        : vector width
 * DATA_TYPE : destination element type (conversion target and pointer cast)
 * BASENAME  : prefix of the row variables
 * PTR       : base address of the destination
 * STRIDE_Y  : multiplier of the row index in the address computation
 * Z         : prefix of the per-row additional offsets
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* The second parameter must be spelled DATA_TYPE: the replacement list refers
 * to DATA_TYPE, so any other parameter name would discard the caller's type
 * argument and leave DATA_TYPE to be resolved from the surrounding scope. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5209*c217d954SCole Faust
5210*c217d954SCole Faust
5211*c217d954SCole Faust
5212*c217d954SCole Faust
/* STORE_BLOCK: store an M0-row block by dispatching to STORE_ROW_<M0>.
 *
 * Two macro levels are used so that M0 is macro-expanded by STORE_BLOCK before
 * STORE_BLOCK_STR pastes it onto the STORE_ROW_ prefix. All remaining
 * arguments are forwarded unchanged.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5215*c217d954SCole Faust
5216*c217d954SCole Faust
5217*c217d954SCole Faust
/* CONVERT_STORE_BLOCK: convert and store an M0-row block by dispatching to
 * CONVERT_STORE_ROW_<M0>.
 *
 * Two macro levels are used so that M0 is macro-expanded by
 * CONVERT_STORE_BLOCK before CONVERT_STORE_BLOCK_STR pastes it onto the
 * CONVERT_STORE_ROW_ prefix. All remaining arguments are forwarded unchanged.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5220*c217d954SCole Faust
5221*c217d954SCole Faust
5222*c217d954SCole Faust
/* STORE_ROW_PARTIAL_n: store n consecutive row vectors through VSTORE_PARTIAL
 * (n = 1..16).
 *
 * Each macro chains to STORE_ROW_PARTIAL_(n-1) and then stores one more row.
 * Row k takes its value from BASENAME##k and is written to
 *     (__global DATA_TYPE *)(PTR + k * STRIDE_Y + Z##k)
 * using VSTORE_PARTIAL(N0, STORE_N0), which is defined elsewhere in this file.
 * Row suffixes are the hexadecimal digits 0-9, A-F.
 *
 * N0        : first argument handed to VSTORE_PARTIAL
 * STORE_N0  : second argument handed to VSTORE_PARTIAL
 * DATA_TYPE : element type used for the destination pointer cast
 * BASENAME  : prefix of the row variables
 * PTR       : base address of the destination
 * STRIDE_Y  : multiplier of the row index in the address computation
 * Z         : prefix of the per-row additional offsets
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5301*c217d954SCole Faust
5302*c217d954SCole Faust
5303*c217d954SCole Faust
/* Dispatches to STORE_ROW_PARTIAL_<STORE_M0>. STORE_M0 selects how many row
 * stores are emitted, STORE_N0 is the per-row store size handed to
 * VSTORE_PARTIAL, and N0 is the full vector size of each row.
 * STORE_BLOCK_PARTIAL forwards to the _STR variant so that its arguments are
 * macro-expanded before STORE_M0 is token-pasted. */
5304*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5305*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5306*c217d954SCole Faust
/* Stores a block, choosing the row count and per-row store size at run time:
 *   neither condition set  -> M0 rows, N0 elements per row
 *   only PARTIAL_COND_Y    -> PARTIAL_STORE_M0 rows, N0 elements per row
 *   only PARTIAL_COND_X    -> M0 rows, PARTIAL_STORE_N0 elements per row
 *   both conditions set    -> PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements
 * The expansion is a bare if / else-if chain (not wrapped in a single
 * statement), and each condition expression can be evaluated more than once. */
5307*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5308*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
5309*c217d954SCole Faust    {                                                                                                                                                     \
5310*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
5311*c217d954SCole Faust    }                                                                                                                                                     \
5312*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
5313*c217d954SCole Faust    {                                                                                                                                                     \
5314*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
5315*c217d954SCole Faust    }                                                                                                                                                     \
5316*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
5317*c217d954SCole Faust    {                                                                                                                                                     \
5318*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
5319*c217d954SCole Faust    }                                                                                                                                                     \
5320*c217d954SCole Faust    else                                                                                                                                                  \
5321*c217d954SCole Faust    {                                                                                                                                                     \
5322*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
5323*c217d954SCole Faust    }
5324*c217d954SCole Faust
/* Stores M0 rows. Each row is stored with the full size N0 when
 * PARTIAL_COND_X is false, otherwise with PARTIAL_STORE_N0 elements.
 * The expansion is a bare if / else statement. */
5325*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
5326*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                         \
5327*c217d954SCole Faust    {                                                                                                             \
5328*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
5329*c217d954SCole Faust    }                                                                                                             \
5330*c217d954SCole Faust    else                                                                                                          \
5331*c217d954SCole Faust    {                                                                                                             \
5332*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
5333*c217d954SCole Faust    }
5334*c217d954SCole Faust
/* Stores full-width rows (N0 elements each). M0 rows are stored when
 * PARTIAL_COND_Y is false, otherwise only PARTIAL_STORE_M0 rows.
 * The expansion is a bare if / else statement. */
5335*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
5336*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                         \
5337*c217d954SCole Faust    {                                                                                                             \
5338*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
5339*c217d954SCole Faust    }                                                                                                             \
5340*c217d954SCole Faust    else                                                                                                          \
5341*c217d954SCole Faust    {                                                                                                             \
5342*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
5343*c217d954SCole Faust    }
5344*c217d954SCole Faust
5345*c217d954SCole Faust
/* STORE_BLOCK_BOUNDARY_AWARE is defined only when both PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0 are defined at build time. Their values pick, at
 * preprocessing time, the cheapest store variant:
 *   M0 == 0 and N0 == 0 -> STORE_BLOCK (no run-time boundary check)
 *   M0 >  0 and N0 == 0 -> STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 and N0 >  0 -> STORE_BLOCK_PARTIAL_IN_X
 *   otherwise           -> STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * All four variants share one parameter list; unused parameters are ignored. */
5346*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
5347*c217d954SCole Faust
5348*c217d954SCole Faust
5349*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
5350*c217d954SCole Faust
5351*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5352*c217d954SCole Faust    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5353*c217d954SCole Faust
5354*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
5355*c217d954SCole Faust
5356*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5357*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
5358*c217d954SCole Faust
5359*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
5360*c217d954SCole Faust
5361*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5362*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
5363*c217d954SCole Faust
5364*c217d954SCole Faust#else
5365*c217d954SCole Faust
5366*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
5367*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
5368*c217d954SCole Faust
5369*c217d954SCole Faust#endif
5370*c217d954SCole Faust
5371*c217d954SCole Faust#endif
5372*c217d954SCole Faust
5373*c217d954SCole Faust
/* Row index at which block number y starts when blocks are M0 rows tall.
 *
 * When PARTIAL_STORE_M0 is defined at build time, the start is moved back by
 * (M0 - PARTIAL_STORE_M0) % M0 rows and clamped at 0; otherwise it is simply
 * y * M0 and the third argument is ignored.
 *
 * y                - block index along the row dimension
 * M0               - number of rows per block
 * PARTIAL_STORE_M0 - number of rows in the partial block
 *
 * Every argument is parenthesized so that expression arguments such as
 * get_global_id(1) + 1 are multiplied as a whole. The result has type uint. */
#if defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
5382*c217d954SCole Faust
5383*c217d954SCole Faust
5384*c217d954SCole Faust
/* Stores a single vector (the variable named basename##0) to ptr: vec_size
 * elements when cond is false, otherwise only leftover elements.
 * Implemented as a one-row STORE_BLOCK_PARTIAL_IN_X with stride 0 and a Z
 * argument of 0 (so Z##0 pastes to the literal 00). */
5385*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
5386*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
5387*c217d954SCole Faust
5388*c217d954SCole Faust
/* Optional OpenCL extensions. Each one is enabled only when both the
 * ARM_COMPUTE_* build flag and the extension's own macro are defined. */
5389*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
5390*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable
5391*c217d954SCole Faust#endif
5392*c217d954SCole Faust
5393*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
5394*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
5395*c217d954SCole Faust#endif
5396*c217d954SCole Faust
5397*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
5398*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
5399*c217d954SCole Faust#endif
5400*c217d954SCole Faust
5401*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
5402*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable
5403*c217d954SCole Faust#endif
5404*c217d954SCole Faust
/* Numeric tags for GPU architecture families. How they are consumed is not
 * visible in this part of the file. */
5405*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100
5406*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200
5407*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300
5408*c217d954SCole Faust
5409*c217d954SCole Faust
/* Pastes a and b into one token. There is no indirection level, so macro
 * arguments are pasted as written, without being expanded first. */
5410*c217d954SCole Faust#define CONCAT(a, b) a##b
5411*c217d954SCole Faust
5412*c217d954SCole Faust
/* Yields x after normal macro expansion. */
5413*c217d954SCole Faust#define EXPAND(x) x
5414*c217d954SCole Faust
5415*c217d954SCole Faust
/* Limits x to the range [min_val, max_val] using the min and max functions. */
5416*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
5417*c217d954SCole Faust
5418*c217d954SCole Faust
/* REVn(x): x with its n components in reverse order, built with a swizzle.
 * REV1 is the identity. */
5419*c217d954SCole Faust#define REV1(x) ((x))
5420*c217d954SCole Faust#define REV2(x) ((x).s10)
5421*c217d954SCole Faust#define REV3(x) ((x).s210)
5422*c217d954SCole Faust#define REV4(x) ((x).s3210)
5423*c217d954SCole Faust#define REV8(x) ((x).s76543210)
5424*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210)
5425*c217d954SCole Faust
5426*c217d954SCole Faust
5427*c217d954SCole Faust
/* REVERSE(x, s): reverse the s-component vector x. The _STR level lets s be
 * macro-expanded before it is pasted onto REV. */
5428*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x))
5429*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s)
5430*c217d954SCole Faust
5431*c217d954SCole Faust
5432*c217d954SCole Faust
/* ROT<s>_<n>(x): the s-component vector x rotated by n positions, so that
 * component i of x ends up at index (i + n) % s of the result.
 * n == 0 and n == s are the identity. */
5433*c217d954SCole Faust#define ROT1_0(x) ((x))
5434*c217d954SCole Faust#define ROT1_1(x) ((x))
5435*c217d954SCole Faust
5436*c217d954SCole Faust#define ROT2_0(x) ((x))
5437*c217d954SCole Faust#define ROT2_1(x) ((x).s10)
5438*c217d954SCole Faust#define ROT2_2(x) ((x))
5439*c217d954SCole Faust
5440*c217d954SCole Faust#define ROT3_0(x) ((x))
5441*c217d954SCole Faust#define ROT3_1(x) ((x).s201)
5442*c217d954SCole Faust#define ROT3_2(x) ((x).s120)
5443*c217d954SCole Faust#define ROT3_3(x) ((x))
5444*c217d954SCole Faust
5445*c217d954SCole Faust#define ROT4_0(x) ((x))
5446*c217d954SCole Faust#define ROT4_1(x) ((x).s3012)
5447*c217d954SCole Faust#define ROT4_2(x) ((x).s2301)
5448*c217d954SCole Faust#define ROT4_3(x) ((x).s1230)
5449*c217d954SCole Faust#define ROT4_4(x) ((x))
5450*c217d954SCole Faust
5451*c217d954SCole Faust#define ROT8_0(x) ((x))
5452*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456)
5453*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345)
5454*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234)
5455*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123)
5456*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012)
5457*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701)
5458*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670)
5459*c217d954SCole Faust#define ROT8_8(x) ((x))
5460*c217d954SCole Faust
5461*c217d954SCole Faust#define ROT16_0(x) ((x))
5462*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
5463*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
5464*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
5465*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
5466*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
5467*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
5468*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
5469*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
5470*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
5471*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
5472*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
5473*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
5474*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
5475*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
5476*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
5477*c217d954SCole Faust#define ROT16_16(x) ((x))
5478*c217d954SCole Faust
5479*c217d954SCole Faust
5480*c217d954SCole Faust
/* ROTATE(x, s, n): rotate the s-component vector x by n positions. The _STR
 * level lets s and n be macro-expanded before they are pasted into the
 * ROT<s>_<n> name. */
5481*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
5482*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
5483*c217d954SCole Faust
5484*c217d954SCole Faust
5485*c217d954SCole Faust
/* V_OFFSn(dt): a literal of type dt##n whose components are 0, 1, ..., n-1. */
5486*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0)
5487*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1)
5488*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2)
5489*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
5490*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
5491*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
5492*c217d954SCole Faust
5493*c217d954SCole Faust
5494*c217d954SCole Faust
/* VEC_OFFS(dt, s): the 0..s-1 index vector of element type dt. The _STR level
 * lets s be macro-expanded before it is pasted onto V_OFFS. */
5495*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
5496*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
5497*c217d954SCole Faust
5498*c217d954SCole Faust
/* VLOAD(size): name of the vload<size> function; call it as
 * VLOAD(size)(offset, ptr). The _STR level lets size be macro-expanded before
 * pasting. */
5499*c217d954SCole Faust#define VLOAD_STR(size) vload##size
5500*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size)
5501*c217d954SCole Faust
5502*c217d954SCole Faust
/* VLOAD_PARTIAL(size, load_size): name of the vload_partial_<size>_<load_size>
 * table entry; call it as VLOAD_PARTIAL(size, load_size)(data, offs, ptr). */
5503*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
5504*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
5505*c217d954SCole Faust
/* Placeholder for unsupported size combinations: expands to an empty block
 * and discards all three arguments. */
5506*c217d954SCole Faust#define NO_LOAD(data, offs, ptr) \
5507*c217d954SCole Faust    {                            \
5508*c217d954SCole Faust    }
5509*c217d954SCole Faust
5510*c217d954SCole Faust
/* Dispatch table for VLOAD_PARTIAL(1, load_size): partial loads into a scalar
 * (size-1) destination. Every entry is invoked as (DATA, OFFSET, PTR).
 * Only load_size == 1 loads anything; all other entries are NO_LOAD. */
#define vload_partial_1_0 NO_LOAD
/* vload1 takes only (OFFSET, PTR), so it cannot serve as the table entry
 * directly: the three-argument call would not preprocess. The loaded value is
 * assigned to DATA here, with a trailing semicolon like the other
 * vload_partial_N helpers. */
#define vload_partial_1_1(DATA, OFFSET, PTR) DATA = vload1(OFFSET, PTR);
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD
5528*c217d954SCole Faust
/* Dispatch table for VLOAD_PARTIAL(2, load_size): load sizes 1 and 2 map to
 * vload_partial_1 / vload_partial_2; 0 and sizes above 2 are NO_LOAD. */
5529*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
5530*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
5531*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
5532*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
5533*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
5534*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
5535*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
5536*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
5537*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
5538*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
5539*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
5540*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
5541*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
5542*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
5543*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
5544*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
5545*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
5546*c217d954SCole Faust
/* Dispatch table for VLOAD_PARTIAL(3, load_size): load sizes 1..3 map to
 * vload_partial_1..3; 0 and sizes above 3 are NO_LOAD. */
5547*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
5548*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
5549*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
5550*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
5551*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
5552*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
5553*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
5554*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
5555*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
5556*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
5557*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
5558*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
5559*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
5560*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
5561*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
5562*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
5563*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
5564*c217d954SCole Faust
/* Dispatch table for VLOAD_PARTIAL(4, load_size): load sizes 1..4 map to
 * vload_partial_1..4; 0 and sizes above 4 are NO_LOAD. */
5565*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
5566*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
5567*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
5568*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
5569*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
5570*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
5571*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
5572*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
5573*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
5574*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
5575*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
5576*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
5577*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
5578*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
5579*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
5580*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
5581*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
5582*c217d954SCole Faust
/* Dispatch table for VLOAD_PARTIAL(8, load_size): load sizes 1..8 map to
 * vload_partial_1..8; 0 and sizes above 8 are NO_LOAD. */
5583*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
5584*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
5585*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
5586*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
5587*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
5588*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
5589*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
5590*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
5591*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
5592*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
5593*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
5594*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
5595*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
5596*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
5597*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
5598*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
5599*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
5600*c217d954SCole Faust
/* Dispatch table for VLOAD_PARTIAL(16, load_size): load sizes 1..16 map to
 * vload_partial_1..16; 0 is NO_LOAD. */
5601*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
5602*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
5603*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
5604*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
5605*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
5606*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
5607*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
5608*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
5609*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
5610*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
5611*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
5612*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
5613*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
5614*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
5615*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
5616*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
5617*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
5618*c217d954SCole Faust
5619*c217d954SCole Faust
/* vload_partial_N(DATA, OFFSET, PTR): load N consecutive elements from PTR
 * into the first N components of DATA; the other components are untouched.
 * Sizes with a native vload (1, 2, 3, 4, 8, 16) use one load; the remaining
 * sizes are split into a 4- or 8-wide load plus a smaller load at PTR + 4 or
 * PTR + 8.
 * Each helper expands to one or two complete statements, semicolon included.
 * NOTE(review): the pieces agree on addresses only for OFFSET == 0. vloadN
 * addresses PTR + OFFSET * N, while vload1 addresses PTR + OFFSET, so a
 * non-zero OFFSET would make the split sizes read non-contiguous elements.
 * Confirm that callers always pass 0. */
5620*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
5621*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
5622*c217d954SCole Faust
5623*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
5624*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
5625*c217d954SCole Faust
5626*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
5627*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
5628*c217d954SCole Faust
5629*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
5630*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
5631*c217d954SCole Faust
5632*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
5633*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
5634*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
5635*c217d954SCole Faust
5636*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
5637*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
5638*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
5639*c217d954SCole Faust
5640*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
5641*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
5642*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
5643*c217d954SCole Faust
5644*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
5645*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
5646*c217d954SCole Faust
5647*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
5648*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5649*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
5650*c217d954SCole Faust
5651*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
5652*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5653*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
5654*c217d954SCole Faust
5655*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
5656*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5657*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
5658*c217d954SCole Faust
5659*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
5660*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5661*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
5662*c217d954SCole Faust
/* Sizes 13..15 pass the whole upper half (s89ABCDEF) to the 5/6/7 helper,
 * which then fills only its first 5, 6 or 7 components. */
5663*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
5664*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5665*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
5666*c217d954SCole Faust
5667*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
5668*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5669*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
5670*c217d954SCole Faust
5671*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
5672*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
5673*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
5674*c217d954SCole Faust
5675*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
5676*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
5677*c217d954SCole Faust
5678*c217d954SCole Faust
5679*c217d954SCole Faust
/* PIXEL_UNIT<n>: n / 4 for vector sizes 4, 8 and 16. Presumably the number of
 * 4-component image pixels that hold an n-element vector; this matches the
 * x1 / x2 / x4 suffixes of the read_image2d_* helpers. */
5680*c217d954SCole Faust#define PIXEL_UNIT4 1
5681*c217d954SCole Faust#define PIXEL_UNIT8 2
5682*c217d954SCole Faust#define PIXEL_UNIT16 4
5683*c217d954SCole Faust
5684*c217d954SCole Faust
/* Maps a vector size (4, 8 or 16) to its PIXEL_UNIT value. The _STR level
 * lets vec_size be macro-expanded before it is pasted. */
5685*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
5686*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
5687*c217d954SCole Faust
5688*c217d954SCole Faust
/* read_image2d_<type>x<k>(img, x_coord, y_coord): read k horizontally adjacent
 * pixels starting at (x_coord, y_coord) and pack them into one vector of
 * 4 * k elements (float4 / float8 / float16, or the half equivalents).
 *
 * x_coord is parenthesized wherever a pixel offset is added, so expression
 * arguments such as (x << 1) keep their value.
 * The trailing semicolon is part of each expansion and is kept on purpose:
 * code elsewhere may rely on it to terminate the statement. */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)((x_coord) + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)((x_coord) + 1, y_coord)), read_imagef(img, (int2)((x_coord) + 2, y_coord)), read_imagef(img, (int2)((x_coord) + 3, y_coord)));

/* Half-precision variants, available only when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)((x_coord) + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)((x_coord) + 1, y_coord)), read_imageh(img, (int2)((x_coord) + 2, y_coord)), read_imageh(img, (int2)((x_coord) + 3, y_coord)));
#endif
5698*c217d954SCole Faust
/* write_image2d_<type>x<k>(img, x_coord, y_coord, values): write the 4 * k
 * elements of values to k horizontally adjacent pixels starting at
 * (x_coord, y_coord), four elements per pixel.
 *
 * values is parenthesized before each swizzle and x_coord before each pixel
 * offset, so expression arguments (for example a + b) are written as a whole.
 * The trailing semicolon is part of each expansion and is kept on purpose:
 * code elsewhere may rely on it to terminate the statement. */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), (values).s0123), write_imagef(img, (int2)((x_coord) + 1, y_coord), (values).s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), (values).s0123), write_imagef(img, (int2)((x_coord) + 1, y_coord), (values).s4567), write_imagef(img, (int2)((x_coord) + 2, y_coord), (values).s89AB), write_imagef(img, (int2)((x_coord) + 3, y_coord), (values).sCDEF));

/* Half-precision variants, available only when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), (values).s0123), write_imageh(img, (int2)((x_coord) + 1, y_coord), (values).s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), (values).s0123), write_imageh(img, (int2)((x_coord) + 1, y_coord), (values).s4567), write_imageh(img, (int2)((x_coord) + 2, y_coord), (values).s89AB), write_imageh(img, (int2)((x_coord) + 3, y_coord), (values).sCDEF));
#endif
5708*c217d954SCole Faust
5709*c217d954SCole Faust
/* READ_IMAGE2D(data_type, n0, ...): dispatches to
 * read_image2d_<data_type>x<n0>. The _STR level lets data_type and n0 be
 * macro-expanded before pasting. */
5710*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
5711*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
5712*c217d954SCole Faust
5713*c217d954SCole Faust
/* WRITE_IMAGE2D(data_type, n0, ...): dispatches to
 * write_image2d_<data_type>x<n0> in the same way. */
5714*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
5715*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
5716*c217d954SCole Faust
/* VSTORE(size): name of the vstore<size> function; call it as
 * VSTORE(size)(data, offset, ptr). The _STR level lets size be macro-expanded
 * before pasting. */
5717*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
5718*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
5719*c217d954SCole Faust
/* Size-1 aliases: <type>1 names the scalar type, so macros that paste a
 * vector size onto a type name (for example dt##1 in V_OFFS1) also work for
 * size 1. */
5720*c217d954SCole Faust#define float1 float
5721*c217d954SCole Faust#define half1 half
5722*c217d954SCole Faust#define char1 char
5723*c217d954SCole Faust#define uchar1 uchar
5724*c217d954SCole Faust#define short1 short
5725*c217d954SCole Faust#define ushort1 ushort
5726*c217d954SCole Faust#define int1 int
5727*c217d954SCole Faust#define uint1 uint
5728*c217d954SCole Faust#define long1 long
5729*c217d954SCole Faust#define ulong1 ulong
5730*c217d954SCole Faust#define double1 double
5731*c217d954SCole Faust
/* Scalar counterparts of vloadN / vstoreN, so that VLOAD(1) and VSTORE(1)
 * resolve to something usable.
 *   vload1(OFFSET, PTR)        - value of the element at PTR + OFFSET
 *   vstore1(DATA, OFFSET, PTR) - assigns DATA to the element at PTR + OFFSET
 * Arguments are parenthesized so that expression arguments (for example a
 * conditional offset) are added to the pointer as a whole. The load is also
 * wrapped in parentheses so postfix operators apply to the loaded value. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) *((OFFSET) + (PTR)) = (DATA)
5734*c217d954SCole Faust
5735*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size): name of the
 * vstore_partial_<size>_<store_size> table entry; call it as
 * VSTORE_PARTIAL(size, store_size)(data, offs, ptr). The _STR level lets both
 * arguments be macro-expanded before pasting. */
5736*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
5737*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
5738*c217d954SCole Faust
/* Placeholder for unsupported size combinations: expands to an empty block
 * and discards all three arguments. */
5739*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
5740*c217d954SCole Faust    {                             \
5741*c217d954SCole Faust    }
5742*c217d954SCole Faust
5743*c217d954SCole Faust
/* Dispatch table for VSTORE_PARTIAL(1, store_size): only store_size 1 stores
 * anything (through vstore1); every other entry is NO_STORE. */
5744*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
5745*c217d954SCole Faust#define vstore_partial_1_1 vstore1
5746*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
5747*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
5748*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
5749*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
5750*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
5751*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
5752*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
5753*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
5754*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
5755*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
5756*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
5757*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
5758*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
5759*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
5760*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
5761*c217d954SCole Faust
/* Dispatch table for VSTORE_PARTIAL(2, store_size): store sizes 1 and 2 map
 * to vstore_partial_1 / vstore_partial_2; 0 and sizes above 2 are NO_STORE. */
5762*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
5763*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
5764*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
5765*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
5766*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
5767*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
5768*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
5769*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
5770*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
5771*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
5772*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
5773*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
5774*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
5775*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
5776*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
5777*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
5778*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
5779*c217d954SCole Faust
/* Dispatch table for VSTORE_PARTIAL(3, store_size): store sizes 1..3 map to
 * vstore_partial_1..3; 0 and sizes above 3 are NO_STORE. */
5780*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
5781*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
5782*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
5783*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
5784*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
5785*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
5786*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
5787*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
5788*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
5789*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
5790*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
5791*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
5792*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
5793*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
5794*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
5795*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
5796*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
5797*c217d954SCole Faust
5798*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
5799*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
5800*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
5801*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
5802*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
5803*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
5804*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
5805*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
5806*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
5807*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
5808*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
5809*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
5810*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
5811*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
5812*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
5813*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
5814*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
5815*c217d954SCole Faust
5816*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
5817*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
5818*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
5819*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
5820*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
5821*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
5822*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
5823*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
5824*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
5825*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
5826*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
5827*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
5828*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
5829*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
5830*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
5831*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
5832*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
5833*c217d954SCole Faust
5834*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
5835*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
5836*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
5837*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
5838*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
5839*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
5840*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
5841*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
5842*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
5843*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
5844*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
5845*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
5846*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
5847*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
5848*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
5849*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
5850*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
5851*c217d954SCole Faust
5852*c217d954SCole Faust
5853*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
5854*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
5855*c217d954SCole Faust
5856*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
5857*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
5858*c217d954SCole Faust
5859*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
5860*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
5861*c217d954SCole Faust
5862*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
5863*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
5864*c217d954SCole Faust
5865*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
5866*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
5867*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
5868*c217d954SCole Faust
5869*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
5870*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
5871*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
5872*c217d954SCole Faust
5873*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
5874*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
5875*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
5876*c217d954SCole Faust
5877*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
5878*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
5879*c217d954SCole Faust
5880*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
5881*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5882*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
5883*c217d954SCole Faust
5884*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
5885*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5886*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
5887*c217d954SCole Faust
5888*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
5889*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5890*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
5891*c217d954SCole Faust
5892*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
5893*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5894*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
5895*c217d954SCole Faust
5896*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
5897*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5898*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
5899*c217d954SCole Faust
5900*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
5901*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5902*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
5903*c217d954SCole Faust
5904*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
5905*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
5906*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
5907*c217d954SCole Faust
5908*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
5909*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
5910*c217d954SCole Faust
5911*c217d954SCole Faust
5912*c217d954SCole Faust
5913*c217d954SCole Faust
5914*c217d954SCole Faust
/* Saturating conversions to floating-point destinations.
 *
 * OpenCL C only provides the _sat modifier for integer destination types, so the _sat
 * spellings for float and half are mapped to the plain conversion of the same type and
 * width. The size-1 spellings resolve to the scalar conversion.
 */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
/* Maps to convert_half (not convert_float) so the result keeps the requested type,
 * in line with convert_half1_sat below. */
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16
5929*c217d954SCole Faust
/* Size-1 conversion spellings: convert_<type>1 resolves to the scalar convert_<type>,
 * so a conversion name built by pasting a vector width of 1 onto a type still names an
 * existing conversion. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
5941*c217d954SCole Faust
/* Saturating size-1 conversion spellings: convert_<type>1_sat resolves to the scalar
 * convert_<type>_sat.
 *
 * The convert_uchar<N>_sat entries expand to themselves, i.e. they leave the name
 * unchanged.
 */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
/* double is a floating-point destination: OpenCL C has no convert_double_sat, so the
 * saturating spelling resolves to the plain conversion, as done for float and half. */
#define convert_double1_sat convert_double
5956*c217d954SCole Faust
/* Type-name and conversion builders.
 *
 * Each public macro forwards to a *_STR helper so that macro arguments are expanded
 * before they are token-pasted.
 */

/* VEC_DATA_TYPE(type, size): the vector type name <type><size>. */
#define VEC_DATA_TYPE_STR(elem_type, width) elem_type##width
#define VEC_DATA_TYPE(elem_type, width) VEC_DATA_TYPE_STR(elem_type, width)

/* CONVERT(x, type): convert_<type>(x). */
#define CONVERT_STR(value, dst_type) (convert_##dst_type((value)))
#define CONVERT(value, dst_type) CONVERT_STR(value, dst_type)

/* CONVERT_SAT(x, type): convert_<type>_sat(x). */
#define CONVERT_SAT_STR(value, dst_type) (convert_##dst_type##_sat((value)))
#define CONVERT_SAT(value, dst_type) CONVERT_SAT_STR(value, dst_type)

/* CONVERT_SAT_ROUND(x, type, round): convert_<type>_sat_<round>(x). */
#define CONVERT_SAT_ROUND_STR(value, dst_type, mode) (convert_##dst_type##_sat_##mode((value)))
#define CONVERT_SAT_ROUND(value, dst_type, mode) CONVERT_SAT_ROUND_STR(value, dst_type, mode)
5968*c217d954SCole Faust
/* Integer vector type with the same element width as <type>, keeping the signedness of
 * integer inputs; half maps to short and float maps to int.
 *
 * NOTE(review): presumably the mask/condition type for select() on <type> vectors -
 * confirm against the call sites.
 */
#define select_vec_dt_uchar(n)  uchar##n
#define select_vec_dt_char(n)   char##n
#define select_vec_dt_ushort(n) ushort##n
#define select_vec_dt_short(n)  short##n
#define select_vec_dt_half(n)   short##n
#define select_vec_dt_uint(n)   uint##n
#define select_vec_dt_int(n)    int##n
#define select_vec_dt_float(n)  int##n
#define select_vec_dt_ulong(n)  ulong##n
#define select_vec_dt_long(n)   long##n

/* SELECT_VEC_DATA_TYPE(type, size) expands its arguments first and then dispatches on
 * the type name; SELECT_DATA_TYPE(type) is the size-1 case. */
#define SELECT_VEC_DATA_TYPE_STR(elem_type, n) select_vec_dt_##elem_type(n)
#define SELECT_VEC_DATA_TYPE(elem_type, n) SELECT_VEC_DATA_TYPE_STR(elem_type, n)
#define SELECT_DATA_TYPE(elem_type) SELECT_VEC_DATA_TYPE_STR(elem_type, 1)
5983*c217d954SCole Faust
/* Signed integer vector type with the same element width as <type>: unsigned types map
 * to their signed counterpart, half maps to short and float maps to int. */
#define signed_int_vec_dt_uchar(n)  char##n
#define signed_int_vec_dt_char(n)   char##n
#define signed_int_vec_dt_ushort(n) short##n
#define signed_int_vec_dt_short(n)  short##n
#define signed_int_vec_dt_half(n)   short##n
#define signed_int_vec_dt_uint(n)   int##n
#define signed_int_vec_dt_int(n)    int##n
#define signed_int_vec_dt_float(n)  int##n
#define signed_int_vec_dt_ulong(n)  long##n
#define signed_int_vec_dt_long(n)   long##n

/* SIGNED_INT_VEC_DATA_TYPE(type, size) expands its arguments first and then dispatches
 * on the type name; SIGNED_INT_DATA_TYPE(type) is the size-1 case. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(elem_type, n) signed_int_vec_dt_##elem_type(n)
#define SIGNED_INT_VEC_DATA_TYPE(elem_type, n) SIGNED_INT_VEC_DATA_TYPE_STR(elem_type, n)
#define SIGNED_INT_DATA_TYPE(elem_type) SIGNED_INT_VEC_DATA_TYPE_STR(elem_type, 1)
5998*c217d954SCole Faust
/* Horizontal sum of a vector.
 *
 * sum_reduce_terms_<n>(x) expands to the bare chain of additions over the n lanes of x.
 * It is an internal building block and is deliberately left without outer parentheses
 * so that the lanes are still added strictly left to right.
 *
 * sum_reduce_<n>(x) wraps that chain in a single pair of parentheses, which makes the
 * result safe to use inside a larger expression (for example 2 * SUM_REDUCE(v, 4)).
 */
#define sum_reduce_terms_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_terms_3(x) sum_reduce_terms_2((x).s01) + ((x).s2)
#define sum_reduce_terms_4(x) sum_reduce_terms_2((x).s01) + sum_reduce_terms_2((x).s23)
#define sum_reduce_terms_8(x) sum_reduce_terms_4((x).s0123) + sum_reduce_terms_4((x).s4567)
#define sum_reduce_terms_16(x) sum_reduce_terms_8((x).s01234567) + sum_reduce_terms_8((x).s89ABCDEF)

#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (sum_reduce_terms_2(x))
#define sum_reduce_3(x) (sum_reduce_terms_3(x))
#define sum_reduce_4(x) (sum_reduce_terms_4(x))
#define sum_reduce_8(x) (sum_reduce_terms_8(x))
#define sum_reduce_16(x) (sum_reduce_terms_16(x))

/* SUM_REDUCE(x, size): sum of the <size> lanes of x. The extra level lets <size> be a
 * macro that is expanded before it is pasted onto the helper name. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
6008*c217d954SCole Faust
/* Horizontal product of a vector.
 *
 * prod_reduce_terms_<n>(x) expands to the bare chain of multiplications over the n
 * lanes of x. It is an internal building block and is deliberately left without outer
 * parentheses so that the lanes are still multiplied strictly left to right.
 *
 * prod_reduce_<n>(x) wraps that chain in a single pair of parentheses, which makes the
 * result safe to use inside a larger expression (for example a / PROD_REDUCE(v, 4)).
 */
#define prod_reduce_terms_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_terms_3(x) prod_reduce_terms_2((x).s01) * ((x).s2)
#define prod_reduce_terms_4(x) prod_reduce_terms_2((x).s01) * prod_reduce_terms_2((x).s23)
#define prod_reduce_terms_8(x) prod_reduce_terms_4((x).s0123) * prod_reduce_terms_4((x).s4567)
#define prod_reduce_terms_16(x) prod_reduce_terms_8((x).s01234567) * prod_reduce_terms_8((x).s89ABCDEF)

#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (prod_reduce_terms_2(x))
#define prod_reduce_3(x) (prod_reduce_terms_3(x))
#define prod_reduce_4(x) (prod_reduce_terms_4(x))
#define prod_reduce_8(x) (prod_reduce_terms_8(x))
#define prod_reduce_16(x) (prod_reduce_terms_16(x))

/* PROD_REDUCE(x, size): product of the <size> lanes of x. The extra level lets <size>
 * be a macro that is expanded before it is pasted onto the helper name. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
6018*c217d954SCole Faust
/* Horizontal maximum of a vector: max_reduce_<n>(v) combines the n lanes of v with
 * max(), reducing the lower and upper parts separately and then combining the two. */
#define max_reduce_1(v)  (v)
#define max_reduce_2(v)  max(((v).s0), ((v).s1))
#define max_reduce_3(v)  max(max_reduce_2((v).s01), ((v).s2))
#define max_reduce_4(v)  max(max_reduce_2((v).s01), max_reduce_2((v).s23))
#define max_reduce_8(v)  max(max_reduce_4((v).s0123), max_reduce_4((v).s4567))
#define max_reduce_16(v) max(max_reduce_8((v).s01234567), max_reduce_8((v).s89ABCDEF))

/* MAX_REDUCE(x, size): maximum of the <size> lanes of x. The extra level lets <size> be
 * a macro that is expanded before it is pasted onto the helper name. */
#define MAX_REDUCE_STR(v, lanes) max_reduce_##lanes(v)
#define MAX_REDUCE(v, lanes) MAX_REDUCE_STR(v, lanes)
6028*c217d954SCole Faust
/* Kernel-argument list builders.
 *
 * Each macro expands to the comma-separated parameters that describe one buffer called
 * <name>: a __global uchar pointer, a stride/step pair for every dimension, and finally
 * the byte offset of the first element. The expansion has no trailing comma.
 */

/* 1D: x */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr,  \
    uint name##_stride_x,        \
    uint name##_step_x,          \
    uint name##_offset_first_element_in_bytes

/* 2D: x, y */
#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x,       \
    uint name##_step_x,         \
    uint name##_stride_y,       \
    uint name##_step_y,         \
    uint name##_offset_first_element_in_bytes

/* 3D: x, y, z */
#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_offset_first_element_in_bytes

/* 4D: x, y, z, w */
#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_offset_first_element_in_bytes

/* 5D: x, y, z, w, v */
#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_stride_v,          \
    uint name##_step_v,            \
    uint name##_offset_first_element_in_bytes
6078*c217d954SCole Faust
/* Struct builders for buffers declared with the *_DECLARATION macros.
 *
 * Each macro forwards the <name>_* kernel arguments to the matching helper function.
 * The _NO_STEP variants pass 0 for the step arguments shown as 0 below.
 */

/* Vector positioned at the current work-item. */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

/* Image positioned at the current work-item. */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/* Image view of a 3D tensor. This macro was previously defined twice with identical
 * text; a single definition is kept. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Only the x and y steps are zeroed here; the z step is still forwarded. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

/* 3D tensor positioned at the current work-item. */
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

/* 4D tensor positioned at the current work-item; mod_size is forwarded as the last
 * argument of update_tensor4D_workitem_ptr. */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

/* 3D tensor whose pointer is left at the buffer start (no offset or step applied). */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
6117*c217d954SCole Faust
6118*c217d954SCole Faust
6119*c217d954SCole Fausttypedef struct Vector
6120*c217d954SCole Faust{
6121*c217d954SCole Faust    __global uchar *ptr;
6122*c217d954SCole Faust    int             offset_first_element_in_bytes;
6123*c217d954SCole Faust    int             stride_x;
6124*c217d954SCole Faust} Vector;
6125*c217d954SCole Faust
6126*c217d954SCole Faust
6127*c217d954SCole Fausttypedef struct Image
6128*c217d954SCole Faust{
6129*c217d954SCole Faust    __global uchar *ptr;
6130*c217d954SCole Faust    int             offset_first_element_in_bytes;
6131*c217d954SCole Faust    int             stride_x;
6132*c217d954SCole Faust    int             stride_y;
6133*c217d954SCole Faust} Image;
6134*c217d954SCole Faust
6135*c217d954SCole Faust
6136*c217d954SCole Fausttypedef struct Tensor3D
6137*c217d954SCole Faust{
6138*c217d954SCole Faust    __global uchar *ptr;
6139*c217d954SCole Faust    int             offset_first_element_in_bytes;
6140*c217d954SCole Faust    int             stride_x;
6141*c217d954SCole Faust    int             stride_y;
6142*c217d954SCole Faust    int             stride_z;
6143*c217d954SCole Faust} Tensor3D;
6144*c217d954SCole Faust
6145*c217d954SCole Faust
6146*c217d954SCole Fausttypedef struct Tensor4D
6147*c217d954SCole Faust{
6148*c217d954SCole Faust    __global uchar *ptr;
6149*c217d954SCole Faust    int             offset_first_element_in_bytes;
6150*c217d954SCole Faust    int             stride_x;
6151*c217d954SCole Faust    int             stride_y;
6152*c217d954SCole Faust    int             stride_z;
6153*c217d954SCole Faust    int             stride_w;
6154*c217d954SCole Faust} Tensor4D;
6155*c217d954SCole Faust
6156*c217d954SCole Faust
6157*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
6158*c217d954SCole Faust{
6159*c217d954SCole Faust    Vector vector =
6160*c217d954SCole Faust    {
6161*c217d954SCole Faust        .ptr                           = ptr,
6162*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6163*c217d954SCole Faust        .stride_x                      = stride_x,
6164*c217d954SCole Faust    };
6165*c217d954SCole Faust    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
6166*c217d954SCole Faust    return vector;
6167*c217d954SCole Faust}
6168*c217d954SCole Faust
6169*c217d954SCole Faust
6170*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
6171*c217d954SCole Faust{
6172*c217d954SCole Faust    Image img =
6173*c217d954SCole Faust    {
6174*c217d954SCole Faust        .ptr                           = ptr,
6175*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6176*c217d954SCole Faust        .stride_x                      = stride_x,
6177*c217d954SCole Faust        .stride_y                      = stride_y
6178*c217d954SCole Faust    };
6179*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
6180*c217d954SCole Faust    return img;
6181*c217d954SCole Faust}
6182*c217d954SCole Faust
6183*c217d954SCole Faust
6184*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6185*c217d954SCole Faust{
6186*c217d954SCole Faust    Image img =
6187*c217d954SCole Faust    {
6188*c217d954SCole Faust        .ptr                           = ptr,
6189*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6190*c217d954SCole Faust        .stride_x                      = stride_x,
6191*c217d954SCole Faust        .stride_y                      = stride_y
6192*c217d954SCole Faust    };
6193*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
6194*c217d954SCole Faust    return img;
6195*c217d954SCole Faust}
6196*c217d954SCole Faust
6197*c217d954SCole Faust
6198*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6199*c217d954SCole Faust{
6200*c217d954SCole Faust    Tensor3D tensor =
6201*c217d954SCole Faust    {
6202*c217d954SCole Faust        .ptr                           = ptr,
6203*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6204*c217d954SCole Faust        .stride_x                      = stride_x,
6205*c217d954SCole Faust        .stride_y                      = stride_y,
6206*c217d954SCole Faust        .stride_z                      = stride_z
6207*c217d954SCole Faust    };
6208*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
6209*c217d954SCole Faust    return tensor;
6210*c217d954SCole Faust}
6211*c217d954SCole Faust
6212*c217d954SCole Faust
6213*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6214*c217d954SCole Faust{
6215*c217d954SCole Faust    Tensor3D tensor =
6216*c217d954SCole Faust    {
6217*c217d954SCole Faust        .ptr                           = ptr,
6218*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6219*c217d954SCole Faust        .stride_x                      = stride_x,
6220*c217d954SCole Faust        .stride_y                      = stride_y,
6221*c217d954SCole Faust        .stride_z                      = stride_z
6222*c217d954SCole Faust    };
6223*c217d954SCole Faust    return tensor;
6224*c217d954SCole Faust}
6225*c217d954SCole Faust
6226*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
6227*c217d954SCole Faust                                             uint step_w,
6228*c217d954SCole Faust                                             uint mod_size)
6229*c217d954SCole Faust{
6230*c217d954SCole Faust    Tensor4D tensor =
6231*c217d954SCole Faust    {
6232*c217d954SCole Faust        .ptr                           = ptr,
6233*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6234*c217d954SCole Faust        .stride_x                      = stride_x,
6235*c217d954SCole Faust        .stride_y                      = stride_y,
6236*c217d954SCole Faust        .stride_z                      = stride_z,
6237*c217d954SCole Faust        .stride_w                      = stride_w
6238*c217d954SCole Faust    };
6239*c217d954SCole Faust
6240*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
6241*c217d954SCole Faust    return tensor;
6242*c217d954SCole Faust}
6243*c217d954SCole Faust
6244*c217d954SCole Faust
6245*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
6246*c217d954SCole Faust{
6247*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
6248*c217d954SCole Faust}
6249*c217d954SCole Faust
6250*c217d954SCole Faust
6251*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
6252*c217d954SCole Faust{
6253*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
6254*c217d954SCole Faust}
6255*c217d954SCole Faust
6256*c217d954SCole Faust
6257*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
6258*c217d954SCole Faust{
6259*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
6260*c217d954SCole Faust}
6261*c217d954SCole Faust
6262*c217d954SCole Faust
6263*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
6264*c217d954SCole Faust{
6265*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
6266*c217d954SCole Faust}
6267*c217d954SCole Faust
6268*c217d954SCole Faust
6269*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
6270*c217d954SCole Faust{
6271*c217d954SCole Faust    uint num_elements = width * height;
6272*c217d954SCole Faust
6273*c217d954SCole Faust    const uint z = index / num_elements;
6274*c217d954SCole Faust
6275*c217d954SCole Faust    index %= num_elements;
6276*c217d954SCole Faust
6277*c217d954SCole Faust    const uint y = index / width;
6278*c217d954SCole Faust
6279*c217d954SCole Faust    index %= width;
6280*c217d954SCole Faust
6281*c217d954SCole Faust    const uint x = index;
6282*c217d954SCole Faust
6283*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
6284*c217d954SCole Faust}
6285*c217d954SCole Faust
6286*c217d954SCole Faust#endif
6287*c217d954SCole Faust
6288*c217d954SCole Faust
6289*c217d954SCole Faust
/*
 * REPEAT_3_<n>(OP, ARG0, ARG1, ARG2)
 *
 * Expands OP##_DEF(id, ARG0, ARG1, ARG2) <n> times, from id <n>-1 down to id 0, with a ';'
 * after every expansion except the last one (id 0), which the call site terminates.
 * Ids above 9 are the tokens A, B, C, D, E and F.
 * The parameter names deliberately avoid the bare tokens A..F because those appear as
 * literal ids in the replacement lists.
 */
#define REPEAT_3_1(OP, ARG0, ARG1, ARG2) OP##_DEF(0, ARG0, ARG1, ARG2)
#define REPEAT_3_2(OP, ARG0, ARG1, ARG2) OP##_DEF(1, ARG0, ARG1, ARG2); REPEAT_3_1(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_3(OP, ARG0, ARG1, ARG2) OP##_DEF(2, ARG0, ARG1, ARG2); REPEAT_3_2(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_4(OP, ARG0, ARG1, ARG2) OP##_DEF(3, ARG0, ARG1, ARG2); REPEAT_3_3(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_5(OP, ARG0, ARG1, ARG2) OP##_DEF(4, ARG0, ARG1, ARG2); REPEAT_3_4(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_6(OP, ARG0, ARG1, ARG2) OP##_DEF(5, ARG0, ARG1, ARG2); REPEAT_3_5(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_7(OP, ARG0, ARG1, ARG2) OP##_DEF(6, ARG0, ARG1, ARG2); REPEAT_3_6(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_8(OP, ARG0, ARG1, ARG2) OP##_DEF(7, ARG0, ARG1, ARG2); REPEAT_3_7(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_9(OP, ARG0, ARG1, ARG2) OP##_DEF(8, ARG0, ARG1, ARG2); REPEAT_3_8(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_10(OP, ARG0, ARG1, ARG2) OP##_DEF(9, ARG0, ARG1, ARG2); REPEAT_3_9(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_11(OP, ARG0, ARG1, ARG2) OP##_DEF(A, ARG0, ARG1, ARG2); REPEAT_3_10(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_12(OP, ARG0, ARG1, ARG2) OP##_DEF(B, ARG0, ARG1, ARG2); REPEAT_3_11(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_13(OP, ARG0, ARG1, ARG2) OP##_DEF(C, ARG0, ARG1, ARG2); REPEAT_3_12(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_14(OP, ARG0, ARG1, ARG2) OP##_DEF(D, ARG0, ARG1, ARG2); REPEAT_3_13(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_15(OP, ARG0, ARG1, ARG2) OP##_DEF(E, ARG0, ARG1, ARG2); REPEAT_3_14(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_16(OP, ARG0, ARG1, ARG2) OP##_DEF(F, ARG0, ARG1, ARG2); REPEAT_3_15(OP, ARG0, ARG1, ARG2)

/*
 * REPEAT_3_N(COUNT, OP, ARG0, ARG1, ARG2) selects REPEAT_3_<COUNT>.
 * The extra REPEAT_DEF_3_N level lets COUNT be macro-expanded before it is pasted.
 */
#define REPEAT_DEF_3_N(COUNT, OP, ARG0, ARG1, ARG2) REPEAT_3_##COUNT(OP, ARG0, ARG1, ARG2)
#define REPEAT_3_N(COUNT, OP, ARG0, ARG1, ARG2) REPEAT_DEF_3_N(COUNT, OP, ARG0, ARG1, ARG2)
6339*c217d954SCole Faust
6340*c217d954SCole Faust
/*
 * REPEAT_4_<n>(OP, ARG0, ARG1, ARG2, ARG3)
 *
 * Four-argument counterpart of the REPEAT_3 family: expands
 * OP##_DEF(id, ARG0, ARG1, ARG2, ARG3) <n> times, from id <n>-1 down to id 0, with a ';'
 * after every expansion except the last one (id 0). Ids above 9 are the tokens A..F,
 * which is why no parameter is named with one of those bare letters.
 */
#define REPEAT_4_1(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(0, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_2(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(1, ARG0, ARG1, ARG2, ARG3); REPEAT_4_1(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_3(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(2, ARG0, ARG1, ARG2, ARG3); REPEAT_4_2(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_4(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(3, ARG0, ARG1, ARG2, ARG3); REPEAT_4_3(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_5(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(4, ARG0, ARG1, ARG2, ARG3); REPEAT_4_4(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_6(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(5, ARG0, ARG1, ARG2, ARG3); REPEAT_4_5(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_7(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(6, ARG0, ARG1, ARG2, ARG3); REPEAT_4_6(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_8(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(7, ARG0, ARG1, ARG2, ARG3); REPEAT_4_7(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_9(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(8, ARG0, ARG1, ARG2, ARG3); REPEAT_4_8(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_10(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(9, ARG0, ARG1, ARG2, ARG3); REPEAT_4_9(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_11(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(A, ARG0, ARG1, ARG2, ARG3); REPEAT_4_10(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_12(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(B, ARG0, ARG1, ARG2, ARG3); REPEAT_4_11(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_13(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(C, ARG0, ARG1, ARG2, ARG3); REPEAT_4_12(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_14(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(D, ARG0, ARG1, ARG2, ARG3); REPEAT_4_13(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_15(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(E, ARG0, ARG1, ARG2, ARG3); REPEAT_4_14(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_16(OP, ARG0, ARG1, ARG2, ARG3) OP##_DEF(F, ARG0, ARG1, ARG2, ARG3); REPEAT_4_15(OP, ARG0, ARG1, ARG2, ARG3)

/*
 * REPEAT_4_N(COUNT, OP, ARG0, ARG1, ARG2, ARG3) selects REPEAT_4_<COUNT>.
 * The extra REPEAT_DEF_4_N level lets COUNT be macro-expanded before it is pasted.
 */
#define REPEAT_DEF_4_N(COUNT, OP, ARG0, ARG1, ARG2, ARG3) REPEAT_4_##COUNT(OP, ARG0, ARG1, ARG2, ARG3)
#define REPEAT_4_N(COUNT, OP, ARG0, ARG1, ARG2, ARG3) REPEAT_DEF_4_N(COUNT, OP, ARG0, ARG1, ARG2, ARG3)
6390*c217d954SCole Faust
6391*c217d954SCole Faust
/*
 * Per-row operations. Each <OP>_DEF macro acts on the variable whose name is a base name
 * with the row id pasted on; the matching REPEAT_<OP> front end applies it to COUNT rows
 * through REPEAT_3_N. Arguments are substituted without added parentheses, so a caller
 * passing an expression should parenthesise it.
 */

/* Declare <NAME><IDX> with type T, initialised to INIT. */
#define VAR_INIT_TO_CONST_DEF(IDX, T, NAME, INIT) T NAME##IDX = INIT
#define REPEAT_VAR_INIT_TO_CONST(COUNT, T, NAME, INIT) REPEAT_3_N(COUNT, VAR_INIT_TO_CONST, T, NAME, INIT)

/* Declare <DST><IDX> with type T_DST, initialised to CONVERT(<SRC><IDX>, T_DST). */
#define VAR_INIT_CONVERT_DEF(IDX, T_DST, SRC, DST) T_DST DST##IDX = CONVERT(SRC##IDX, T_DST)
#define REPEAT_VAR_INIT_CONVERT(COUNT, T_DST, SRC, DST) REPEAT_3_N(COUNT, VAR_INIT_CONVERT, T_DST, SRC, DST)

/* Same as VAR_INIT_CONVERT_DEF but goes through CONVERT_SAT. */
#define VAR_INIT_CONVERT_SAT_DEF(IDX, T_DST, SRC, DST) T_DST DST##IDX = CONVERT_SAT(SRC##IDX, T_DST)
#define REPEAT_VAR_INIT_CONVERT_SAT(COUNT, T_DST, SRC, DST) REPEAT_3_N(COUNT, VAR_INIT_CONVERT_SAT, T_DST, SRC, DST)

/* <NAME><IDX> += ADDEND cast to T. */
#define ADD_CONST_TO_VAR_DEF(IDX, T, NAME, ADDEND) NAME##IDX += (T)ADDEND
#define REPEAT_ADD_CONST_TO_VAR(COUNT, T, NAME, ADDEND) REPEAT_3_N(COUNT, ADD_CONST_TO_VAR, T, NAME, ADDEND)

/* <ACC><IDX> += <SRC><IDX> * FACTOR. */
#define MLA_VAR_WITH_CONST_VEC_DEF(IDX, ACC, SRC, FACTOR) ACC##IDX += SRC##IDX * FACTOR
#define REPEAT_MLA_VAR_WITH_CONST_VEC(COUNT, ACC, SRC, FACTOR) REPEAT_3_N(COUNT, MLA_VAR_WITH_CONST_VEC, ACC, SRC, FACTOR)

/* <NAME><IDX> += VEC. The second slot is unused; the front end fills it with an empty string literal. */
#define ADD_VECTOR_TO_VAR_DEF(IDX, UNUSED, NAME, VEC) NAME##IDX += VEC
#define REPEAT_ADD_VECTOR_TO_VAR(COUNT, NAME, VEC) REPEAT_3_N(COUNT, ADD_VECTOR_TO_VAR, "", NAME, VEC)

/* <DST><IDX> += <SRC><IDX>. The second slot is unused; the front end fills it with an empty string literal. */
#define ADD_TWO_VARS_DEF(IDX, UNUSED, DST, SRC) DST##IDX += SRC##IDX
#define REPEAT_ADD_TWO_VARS(COUNT, DST, SRC) REPEAT_3_N(COUNT, ADD_TWO_VARS, "", DST, SRC)

/* <NAME><IDX> = max(<NAME><IDX>, BOUND cast to T). */
#define MAX_CONST_VAR_DEF(IDX, T, NAME, BOUND) NAME##IDX = max(NAME##IDX, (T)BOUND)
#define REPEAT_MAX_CONST_VAR(COUNT, T, NAME, BOUND) REPEAT_3_N(COUNT, MAX_CONST_VAR, T, NAME, BOUND)

/* <NAME><IDX> = min(<NAME><IDX>, BOUND cast to T). */
#define MIN_CONST_VAR_DEF(IDX, T, NAME, BOUND) NAME##IDX = min(NAME##IDX, (T)BOUND)
#define REPEAT_MIN_CONST_VAR(COUNT, T, NAME, BOUND) REPEAT_3_N(COUNT, MIN_CONST_VAR, T, NAME, BOUND)
6426*c217d954SCole Faust
6427*c217d954SCole Faust
/*
 * Requantize row <NAME><IDX> in place with ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE,
 * forwarding MULTIPLIER, SHIFT and WIDTH to it. The REPEAT_ front end applies this to
 * COUNT rows through REPEAT_4_N.
 */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(IDX, WIDTH, NAME, MULTIPLIER, SHIFT) \
    NAME##IDX = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(NAME##IDX, MULTIPLIER, SHIFT, WIDTH)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(COUNT, WIDTH, NAME, MULTIPLIER, SHIFT) \
    REPEAT_4_N(COUNT, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, WIDTH, NAME, MULTIPLIER, SHIFT)

/*
 * Requantize row <NAME><IDX> in place with ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE,
 * forwarding MULTIPLIER, SHIFT and WIDTH to it. The REPEAT_ front end applies this to
 * COUNT rows through REPEAT_4_N.
 */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(IDX, WIDTH, NAME, MULTIPLIER, SHIFT) \
    NAME##IDX = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(NAME##IDX, MULTIPLIER, SHIFT, WIDTH)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(COUNT, WIDTH, NAME, MULTIPLIER, SHIFT) \
    REPEAT_4_N(COUNT, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, WIDTH, NAME, MULTIPLIER, SHIFT)
6434*c217d954SCole Faust
6435*c217d954SCole Faust
/*
 * Requantize row <VAR><ID> in place when the shift can differ in sign between channels.
 *
 * Both the GREATER_THAN_ONE and the LESS_THAN_ONE results are computed from the same
 * input row; select() then keeps the LESS_THAN_ONE result where RES_SHIFT >= 0 holds and
 * the GREATER_THAN_ONE result elsewhere.
 *
 * ID        Row id pasted onto VAR
 * SIZE      Vector width of the row; used for the temporaries and forwarded to both
 *           requantization macros, matching the sibling *_GREATER/LESS_THAN_ONE_DEF macros
 *           (it no longer depends on an ambient N0 definition)
 * VAR       Base name of the row variables
 * RES_MUL   Multiplier forwarded to both requantization macros
 * RES_SHIFT Shift forwarded to both requantization macros and tested against 0
 *
 * The temporaries are named <VAR><ID>_shift_lt0 / <VAR><ID>_shift_gt0 and live inside the
 * statement-expression block, so they do not leak into the caller's scope.
 */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT)                         \
    ({                                                                                                            \
        VEC_DATA_TYPE(int, SIZE)                                                                                  \
        VAR##ID##_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE); \
        VEC_DATA_TYPE(int, SIZE)                                                                                  \
        VAR##ID##_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE);    \
        VAR##ID             = select(VAR##ID##_shift_lt0, VAR##ID##_shift_gt0, (RES_SHIFT) >= 0);                 \
    })
/* Apply ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF to N rows through REPEAT_4_N. */
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
6445*c217d954SCole Faust
6446*c217d954SCole Faust#endif
6447*c217d954SCole Faust
6448*c217d954SCole Faust#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
6449*c217d954SCole Faust#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
6450*c217d954SCole Faust
6451*c217d954SCole Faust
6452*c217d954SCole Faust
6453*c217d954SCole Faust
/*
 * TILE_VECTOR_SIZE<W>: number of scalar slots reserved for a tile row of W elements.
 * W = 1..4 map to themselves, W = 5..8 map to 8 and W = 9..16 map to 16.
 */
#define TILE_VECTOR_SIZE1  1
#define TILE_VECTOR_SIZE2  2
#define TILE_VECTOR_SIZE3  3
#define TILE_VECTOR_SIZE4  4
#define TILE_VECTOR_SIZE5  8
#define TILE_VECTOR_SIZE6  8
#define TILE_VECTOR_SIZE7  8
#define TILE_VECTOR_SIZE8  8
#define TILE_VECTOR_SIZE9  16
#define TILE_VECTOR_SIZE10 16
#define TILE_VECTOR_SIZE11 16
#define TILE_VECTOR_SIZE12 16
#define TILE_VECTOR_SIZE13 16
#define TILE_VECTOR_SIZE14 16
#define TILE_VECTOR_SIZE15 16
#define TILE_VECTOR_SIZE16 16

/*
 * TILE_VECTOR_TYPE<W>(T): type name formed by pasting onto T the same width that
 * TILE_VECTOR_SIZE<W> reports (for example T##8 for W = 5..8).
 */
#define TILE_VECTOR_TYPE1(T)  T##1
#define TILE_VECTOR_TYPE2(T)  T##2
#define TILE_VECTOR_TYPE3(T)  T##3
#define TILE_VECTOR_TYPE4(T)  T##4
#define TILE_VECTOR_TYPE5(T)  T##8
#define TILE_VECTOR_TYPE6(T)  T##8
#define TILE_VECTOR_TYPE7(T)  T##8
#define TILE_VECTOR_TYPE8(T)  T##8
#define TILE_VECTOR_TYPE9(T)  T##16
#define TILE_VECTOR_TYPE10(T) T##16
#define TILE_VECTOR_TYPE11(T) T##16
#define TILE_VECTOR_TYPE12(T) T##16
#define TILE_VECTOR_TYPE13(T) T##16
#define TILE_VECTOR_TYPE14(T) T##16
#define TILE_VECTOR_TYPE15(T) T##16
#define TILE_VECTOR_TYPE16(T) T##16

/*
 * TILE(T, ROWS, COLS, NAME) declares NAME as an array of ROWS unions. Each union overlays
 * a scalar array s[] of TILE_VECTOR_SIZE<COLS> elements of type T with a single member v
 * of type TILE_VECTOR_TYPE<COLS>(T).
 * TILE forwards to TILE_STR so that ROWS and COLS are macro-expanded before COLS is pasted.
 */
#define TILE_STR(T, ROWS, COLS, NAME) \
    union                             \
    {                                 \
        T s[TILE_VECTOR_SIZE##COLS];  \
        TILE_VECTOR_TYPE##COLS(T) v;  \
    } NAME[ROWS]
#define TILE(T, ROWS, COLS, NAME) TILE_STR(T, ROWS, COLS, NAME)
6495*c217d954SCole Faust
/*
 * Kernel parameter list for a 4D tensor passed as a buffer: the data pointer, a
 * stride/step pair for each of x, y, z and w, and the offset of the first element.
 * Every parameter name is the tensor name with a suffix pasted on.
 */
#define TENSOR4D_BUFFER(tensor)  \
    __global uchar *tensor##_ptr, \
    uint tensor##_stride_x,       \
    uint tensor##_step_x,         \
    uint tensor##_stride_y,       \
    uint tensor##_step_y,         \
    uint tensor##_stride_z,       \
    uint tensor##_step_z,         \
    uint tensor##_stride_w,       \
    uint tensor##_step_w,         \
    uint tensor##_offset_first_element_in_bytes

/* Same parameter list as TENSOR4D_BUFFER, preceded by a read-only image2d_t named <tensor>_img. */
#define TENSOR4D_IMAGE(tensor)           \
    __read_only image2d_t tensor##_img,   \
    __global uchar *tensor##_ptr,         \
    uint tensor##_stride_x,               \
    uint tensor##_step_x,                 \
    uint tensor##_stride_y,               \
    uint tensor##_step_y,                 \
    uint tensor##_stride_z,               \
    uint tensor##_step_z,                 \
    uint tensor##_stride_w,               \
    uint tensor##_step_w,                 \
    uint tensor##_offset_first_element_in_bytes

/*
 * TENSOR4D(tensor, kind) selects TENSOR4D_<kind>(tensor), where kind is BUFFER or IMAGE.
 * The TENSOR4D_STR level lets kind be macro-expanded before it is pasted.
 */
#define TENSOR4D_STR(tensor, kind) TENSOR4D_##kind(tensor)
#define TENSOR4D(tensor, kind) TENSOR4D_STR(tensor, kind)
6523*c217d954SCole Faust
/*
 * Kernel parameter list for a 4D tensor passed as a buffer: the data pointer, the y, z
 * and w strides, four uint values suffixed _c, _w, _h and _n, and the offset of the first
 * element. Every parameter name is the tensor name with a suffix pasted on.
 */
#define TENSOR4D_T_BUFFER(tensor) \
    __global uchar *tensor##_ptr,  \
    uint tensor##_stride_y,        \
    uint tensor##_stride_z,        \
    uint tensor##_stride_w,        \
    uint tensor##_c,               \
    uint tensor##_w,               \
    uint tensor##_h,               \
    uint tensor##_n,               \
    uint tensor##_offset_first_element_in_bytes

/* Same parameter list as TENSOR4D_T_BUFFER, preceded by a read-only image2d_t named <tensor>_img. */
#define TENSOR4D_T_IMAGE(tensor)       \
    __read_only image2d_t tensor##_img, \
    __global uchar *tensor##_ptr,       \
    uint tensor##_stride_y,             \
    uint tensor##_stride_z,             \
    uint tensor##_stride_w,             \
    uint tensor##_c,                    \
    uint tensor##_w,                    \
    uint tensor##_h,                    \
    uint tensor##_n,                    \
    uint tensor##_offset_first_element_in_bytes

/*
 * TENSOR4D_T(tensor, kind) selects TENSOR4D_T_<kind>(tensor), where kind is BUFFER or IMAGE.
 * The TENSOR4D_T_STR level lets kind be macro-expanded before it is pasted.
 */
#define TENSOR4D_T_STR(tensor, kind) TENSOR4D_T_##kind(tensor)
#define TENSOR4D_T(tensor, kind) TENSOR4D_T_STR(tensor, kind)
6551*c217d954SCole Faust
/*
 * Read-only 4D tensor parameter lists.
 * BUFFER form: exactly TENSOR4D_T_BUFFER(tensor).
 * IMAGE form:  a __read_only image2d_t named <tensor>_img followed by TENSOR4D_T_BUFFER(tensor).
 * TENSOR4D_RO_T(tensor, kind) selects one of the two; the _STR level lets kind be
 * macro-expanded before it is pasted.
 */
#define TENSOR4D_RO_T_BUFFER(tensor) TENSOR4D_T_BUFFER(tensor)
#define TENSOR4D_RO_T_IMAGE(tensor)    \
    __read_only image2d_t tensor##_img, \
    TENSOR4D_T_BUFFER(tensor)
#define TENSOR4D_RO_T_STR(tensor, kind) TENSOR4D_RO_T_##kind(tensor)
#define TENSOR4D_RO_T(tensor, kind) TENSOR4D_RO_T_STR(tensor, kind)

/*
 * Write-only 4D tensor parameter lists.
 * BUFFER form: exactly TENSOR4D_T_BUFFER(tensor).
 * IMAGE form:  a __write_only image2d_t named <tensor>_img followed by TENSOR4D_T_BUFFER(tensor).
 * TENSOR4D_WO_T(tensor, kind) selects one of the two; the _STR level lets kind be
 * macro-expanded before it is pasted.
 */
#define TENSOR4D_WO_T_BUFFER(tensor) TENSOR4D_T_BUFFER(tensor)
#define TENSOR4D_WO_T_IMAGE(tensor)     \
    __write_only image2d_t tensor##_img, \
    TENSOR4D_T_BUFFER(tensor)
#define TENSOR4D_WO_T_STR(tensor, kind) TENSOR4D_WO_T_##kind(tensor)
#define TENSOR4D_WO_T(tensor, kind) TENSOR4D_WO_T_STR(tensor, kind)
6573*c217d954SCole Faust
/*
 * Kernel parameter list for a 3D tensor passed as a buffer: the data pointer, the y and z
 * strides, three uint values suffixed _w, _h and _n, and the offset of the first element.
 * Every parameter name is the tensor name with a suffix pasted on.
 */
#define TENSOR3D_T_BUFFER(tensor) \
    __global uchar *tensor##_ptr,  \
    uint tensor##_stride_y,        \
    uint tensor##_stride_z,        \
    uint tensor##_w,               \
    uint tensor##_h,               \
    uint tensor##_n,               \
    uint tensor##_offset_first_element_in_bytes

/* Same parameter list as TENSOR3D_T_BUFFER, preceded by a read-only image2d_t named <tensor>_img. */
#define TENSOR3D_T_IMAGE(tensor)       \
    __read_only image2d_t tensor##_img, \
    __global uchar *tensor##_ptr,       \
    uint tensor##_stride_y,             \
    uint tensor##_stride_z,             \
    uint tensor##_w,                    \
    uint tensor##_h,                    \
    uint tensor##_n,                    \
    uint tensor##_offset_first_element_in_bytes

/*
 * TENSOR3D_T(tensor, kind) selects TENSOR3D_T_<kind>(tensor), where kind is BUFFER or IMAGE.
 * The TENSOR3D_T_STR level lets kind be macro-expanded before it is pasted.
 */
#define TENSOR3D_T_STR(tensor, kind) TENSOR3D_T_##kind(tensor)
#define TENSOR3D_T(tensor, kind) TENSOR3D_T_STR(tensor, kind)
6595*c217d954SCole Faust
6596*c217d954SCole Faust#if !defined(UNROLL_WITH_PRAGMA)
6597*c217d954SCole Faust#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
6598*c217d954SCole Faust
6599*c217d954SCole Faust#define LOOP_UNROLLING_1(idx, step, macro) (macro)
6600*c217d954SCole Faust#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
6601*c217d954SCole Faust#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
6602*c217d954SCole Faust#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
6603*c217d954SCole Faust#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
6604*c217d954SCole Faust#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
6605*c217d954SCole Faust#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
6606*c217d954SCole Faust#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
6607*c217d954SCole Faust#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
6608*c217d954SCole Faust#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
6609*c217d954SCole Faust#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
6610*c217d954SCole Faust#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
6611*c217d954SCole Faust#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
6612*c217d954SCole Faust#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
6613*c217d954SCole Faust#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
6614*c217d954SCole Faust#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
6615*c217d954SCole Faust#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
6616*c217d954SCole Faust#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
6617*c217d954SCole Faust#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
6618*c217d954SCole Faust#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
6619*c217d954SCole Faust#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
6620*c217d954SCole Faust#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
6621*c217d954SCole Faust#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
6622*c217d954SCole Faust#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
6623*c217d954SCole Faust#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
6624*c217d954SCole Faust#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
6625*c217d954SCole Faust#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
6626*c217d954SCole Faust#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
6627*c217d954SCole Faust#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
6628*c217d954SCole Faust#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
6629*c217d954SCole Faust#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
6630*c217d954SCole Faust#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
6631*c217d954SCole Faust#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
6632*c217d954SCole Faust#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
6633*c217d954SCole Faust#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
6634*c217d954SCole Faust#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
6635*c217d954SCole Faust#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
6636*c217d954SCole Faust#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
6637*c217d954SCole Faust#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
6638*c217d954SCole Faust#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
6639*c217d954SCole Faust#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
6640*c217d954SCole Faust#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
6641*c217d954SCole Faust#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
6642*c217d954SCole Faust#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
6643*c217d954SCole Faust#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
6644*c217d954SCole Faust#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
6645*c217d954SCole Faust#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
6646*c217d954SCole Faust#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
6647*c217d954SCole Faust#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
6648*c217d954SCole Faust#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
6649*c217d954SCole Faust#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
6650*c217d954SCole Faust#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
6651*c217d954SCole Faust#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
6652*c217d954SCole Faust#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
6653*c217d954SCole Faust#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
6654*c217d954SCole Faust#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
6655*c217d954SCole Faust#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
6656*c217d954SCole Faust#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
6657*c217d954SCole Faust#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
6658*c217d954SCole Faust#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
6659*c217d954SCole Faust#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
6660*c217d954SCole Faust#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
6661*c217d954SCole Faust#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
6662*c217d954SCole Faust#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
6663*c217d954SCole Faust#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
6664*c217d954SCole Faust#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
6665*c217d954SCole Faust#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
6666*c217d954SCole Faust#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
6667*c217d954SCole Faust#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
6668*c217d954SCole Faust#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
6669*c217d954SCole Faust#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
6670*c217d954SCole Faust#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
6671*c217d954SCole Faust#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
/* LOOP_UNROLLING_74 .. LOOP_UNROLLING_128.
 * Each LOOP_UNROLLING_<N> expands to LOOP_UNROLLING_<N-1> followed by one more
 * UNROLL_INCR(idx, step, macro), so level N contributes exactly one additional
 * UNROLL_INCR on top of level N-1.
 * UNROLL_INCR and the levels below 74 are defined outside this excerpt. */
6672*c217d954SCole Faust#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
6673*c217d954SCole Faust#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
6674*c217d954SCole Faust#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
6675*c217d954SCole Faust#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
6676*c217d954SCole Faust#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
6677*c217d954SCole Faust#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
6678*c217d954SCole Faust#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
6679*c217d954SCole Faust#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
6680*c217d954SCole Faust#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
6681*c217d954SCole Faust#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
6682*c217d954SCole Faust#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
6683*c217d954SCole Faust#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
6684*c217d954SCole Faust#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
6685*c217d954SCole Faust#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
6686*c217d954SCole Faust#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
6687*c217d954SCole Faust#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
6688*c217d954SCole Faust#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
6689*c217d954SCole Faust#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
6690*c217d954SCole Faust#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
6691*c217d954SCole Faust#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
6692*c217d954SCole Faust#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
6693*c217d954SCole Faust#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
6694*c217d954SCole Faust#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
6695*c217d954SCole Faust#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
6696*c217d954SCole Faust#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
6697*c217d954SCole Faust#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
6698*c217d954SCole Faust#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
6699*c217d954SCole Faust#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
6700*c217d954SCole Faust#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
6701*c217d954SCole Faust#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
6702*c217d954SCole Faust#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
6703*c217d954SCole Faust#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
6704*c217d954SCole Faust#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
6705*c217d954SCole Faust#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
6706*c217d954SCole Faust#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
6707*c217d954SCole Faust#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
6708*c217d954SCole Faust#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
6709*c217d954SCole Faust#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
6710*c217d954SCole Faust#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
6711*c217d954SCole Faust#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
6712*c217d954SCole Faust#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
6713*c217d954SCole Faust#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
6714*c217d954SCole Faust#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
6715*c217d954SCole Faust#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
6716*c217d954SCole Faust#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
6717*c217d954SCole Faust#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
6718*c217d954SCole Faust#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
6719*c217d954SCole Faust#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
6720*c217d954SCole Faust#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
6721*c217d954SCole Faust#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
6722*c217d954SCole Faust#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
6723*c217d954SCole Faust#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
6724*c217d954SCole Faust#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
6725*c217d954SCole Faust#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
6726*c217d954SCole Faust#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)
6727*c217d954SCole Faust
/* Manually unrolled form of LOOP_UNROLLING_STR.
 * Opens a scope, declares `type idx = start`, then expands
 * LOOP_UNROLLING_<num>(idx, step, macro).
 * `num` is token-pasted, so it must already be a plain integer token that has a
 * matching LOOP_UNROLLING_<num> definition (128 is the largest one visible here). */
6728*c217d954SCole Faust#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
6729*c217d954SCole Faust    {                                                          \
6730*c217d954SCole Faust        type idx = start;                                      \
6731*c217d954SCole Faust        LOOP_UNROLLING_##num(idx, step, macro);                \
6732*c217d954SCole Faust    }
6733*c217d954SCole Faust#else
/* Loop form of LOOP_UNROLLING_STR, relying on the compiler's unroll pragma.
 *
 *   type  - type of the induction variable
 *   idx   - name of the induction variable, visible inside `macro`
 *   start - initial value of idx
 *   step  - increment applied to idx after every iteration
 *   num   - number of iterations
 *   macro - braced statement block evaluated once per iteration
 *
 * The upper bound is start + num * step so that the body runs num times for any
 * start value; a bound of num * step alone is only correct when start is 0.
 * All arguments are parenthesized so expression arguments keep their meaning.
 * Note: start is expanded twice, so it should be free of side effects. */
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro)                    \
    {                                                                             \
        _Pragma("unroll")                                                         \
        for(type idx = (start); idx < ((start) + (num) * (step)); idx += (step))  \
        {                                                                         \
            (macro);                                                              \
        }                                                                         \
    }
6742*c217d954SCole Faust#endif
/* Entry point for loop unrolling. Forwards to LOOP_UNROLLING_STR; the extra level
 * of indirection lets macro arguments such as `num` be expanded before the _STR
 * variant uses them. */
6743*c217d954SCole Faust#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
6744*c217d954SCole Faust
6745*c217d954SCole Faust
/* Starting index for this work-item along dimension IDX.
 * Evaluates get_global_id(IDX) * N0 - ((N0 - PARTIAL_N0) % N0), converts the
 * result to int and clamps it to be at least 0.
 *
 *   IDX        - dimension passed to get_global_id
 *   N0         - block size along that dimension
 *   PARTIAL_N0 - presumably the size of the leftover (partial) block; confirm
 *                against callers
 *
 * N0 and PARTIAL_N0 are parenthesized so that expression arguments are not
 * re-associated by operator precedence. */
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * (N0) - ((N0) - (PARTIAL_N0)) % (N0)), 0))
6747*c217d954SCole Faust
6748*c217d954SCole Faust
/* Dot-product dispatcher: accumulates the dot product of a and b into c.
 * DOT_PRODUCT_INTEGER8 forwards to the _STR variant so that K0 is macro-expanded
 * before it is pasted into the name DOT_PRODUCT<K0>_INTEGER8. */
6749*c217d954SCole Faust#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
6750*c217d954SCole Faust#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
/* Single-element case: converts a and b to C_DATA_TYPE, multiplies them and adds
 * the product to c. A_DATA_TYPE and B_DATA_TYPE are not used here. */
6751*c217d954SCole Faust#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6752*c217d954SCole Faust    ({                                                \
6753*c217d954SCole Faust        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
6754*c217d954SCole Faust    })
/* 2-, 3- and 4-element dot products. Four alternative implementations are
 * selected at preprocessing time; exactly one set of definitions is active. */
/* Variant 1: dot(). The 2- and 3-element inputs are widened to 4-element vectors
 * by appending zeros, so the padding lanes contribute nothing to the sum. */
6755*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
6756*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
6757*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
6758*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
/* Variant 2: arm_dot_acc(). The accumulator c is passed as the third argument and
 * the call result is assigned back to c. Same zero padding as above. */
6759*c217d954SCole Faust#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
6760*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
6761*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
6762*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
/* Variant 3: arm_dot(). The call result is added to c. Same zero padding as above. */
6763*c217d954SCole Faust#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
6764*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
6765*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
6766*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
/* Variant 4: generic fallback. Each component is converted to C_DATA_TYPE and the
 * component-wise products are added to the accumulator one at a time. */
6767*c217d954SCole Faust#else
6768*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
6769*c217d954SCole Faust    ({                                                  \
6770*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
6771*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
6772*c217d954SCole Faust    })
6773*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
6774*c217d954SCole Faust    ({                                                  \
6775*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c);  \
6776*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
6777*c217d954SCole Faust    })
6778*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val)   \
6779*c217d954SCole Faust    ({                                                    \
6780*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
6781*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
6782*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
6783*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
6784*c217d954SCole Faust    })
6785*c217d954SCole Faust#endif
/* 5-, 6- and 7-element dot products, built from the smaller cases:
 * components 0..3 go through DOT_PRODUCT4_INTEGER8 and the remaining one, two or
 * three components (starting at s4) go through DOT_PRODUCT1/2/3_INTEGER8. */
6786*c217d954SCole Faust#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6787*c217d954SCole Faust    ({                                                \
6788*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
6789*c217d954SCole Faust        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);     \
6790*c217d954SCole Faust    })
6791*c217d954SCole Faust#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6792*c217d954SCole Faust    ({                                                \
6793*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
6794*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
6795*c217d954SCole Faust    })
6796*c217d954SCole Faust#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6797*c217d954SCole Faust    ({                                                \
6798*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
6799*c217d954SCole Faust        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);     \
6800*c217d954SCole Faust    })
/* 8-element dot product: applies DOT_PRODUCT4_INTEGER8 to the .lo halves of a and
 * b and then to the .hi halves, accumulating both into c. */
6801*c217d954SCole Faust#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6802*c217d954SCole Faust    ({                                                \
6803*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);     \
6804*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);     \
6805*c217d954SCole Faust    })
/* 9- to 12-element dot products: components 0..7 go through
 * DOT_PRODUCT8_INTEGER8 and the remaining one to four components (starting at s8)
 * go through DOT_PRODUCT1/2/3/4_INTEGER8. */
6806*c217d954SCole Faust#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6807*c217d954SCole Faust    ({                                                \
6808*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6809*c217d954SCole Faust        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);     \
6810*c217d954SCole Faust    })
6811*c217d954SCole Faust#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6812*c217d954SCole Faust    ({                                                \
6813*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6814*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);     \
6815*c217d954SCole Faust    })
6816*c217d954SCole Faust#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6817*c217d954SCole Faust    ({                                                \
6818*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6819*c217d954SCole Faust        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);     \
6820*c217d954SCole Faust    })
6821*c217d954SCole Faust#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6822*c217d954SCole Faust    ({                                                \
6823*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
6824*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
6825*c217d954SCole Faust    })
/* 13-element dot product of a and b, accumulated into c.
 * Split as 8 + 4 + 1 components: s0..s7, s8..sB and sC.
 * Every swizzle used here selects 1, 4 or 8 components; a 5-component selection
 * such as .s89ABC does not name a valid OpenCL vector type and cannot compile. */
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sC), ((b).sC), c);     \
    })
/* 14-element dot product of a and b, accumulated into c.
 * Split as 8 + 4 + 2 components: s0..s7, s8..sB and sC..sD.
 * Every swizzle used here selects 2, 4 or 8 components; a 6-component selection
 * such as .s89ABCD does not name a valid OpenCL vector type and cannot compile. */
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCD), ((b).sCD), c);     \
    })
/* 15-element dot product of a and b, accumulated into c.
 * Split as 8 + 4 + 3 components: s0..s7, s8..sB and sC..sE.
 * Every swizzle used here selects 3, 4 or 8 components; a 7-component selection
 * such as .s89ABCDE does not name a valid OpenCL vector type and cannot compile. */
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCDE), ((b).sCDE), c);     \
    })
/* 16-element dot product: applies DOT_PRODUCT8_INTEGER8 to the .lo halves of a
 * and b and then to the .hi halves, accumulating both into c. */
6841*c217d954SCole Faust#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
6842*c217d954SCole Faust    ({                                                 \
6843*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);      \
6844*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);      \
6845*c217d954SCole Faust    })
6846*c217d954SCole Faust
6847*c217d954SCole Faust
/* Reduction helper: calls DOT_PRODUCT_INTEGER8 with a as the first operand and
 * the literal 1 cast to TILE_VECTOR_TYPE<K0>(B_DATA_TYPE) as the second, and
 * accumulates into c. TILE_VECTOR_TYPE<K0> is defined outside this excerpt;
 * presumably the cast yields a vector of ones, making this a sum of the elements
 * of a (confirm against that definition).
 * REDUCE_INTEGER8 forwards to the _STR variant so that K0 is expanded before it is
 * pasted onto TILE_VECTOR_TYPE. */
6848*c217d954SCole Faust#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
6849*c217d954SCole Faust#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
6850*c217d954SCole Faust
6851*c217d954SCole Faust
/* Vector load. V_LOAD forwards to V_LOAD_STR, which pastes TENSOR_TYPE into the
 * name and so selects V_LOAD_BUFFER or V_LOAD_IMAGE.
 *
 * V_LOAD_BUFFER: VLOAD(WIDTH) with offset 0 from the address
 *   TENSOR_ptr + TENSOR_offset_first_element_in_bytes
 *              + X * sizeof(DATA_TYPE) + Y * STRIDE_Y
 *   (assumes TENSOR_ptr is a byte pointer - TODO confirm at the declaration).
 * V_LOAD_IMAGE: READ_IMAGE2D on TENSOR_img at coordinates (X / 4, Y), with the
 *   width converted by CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT; STRIDE_Y is not used. */
6852*c217d954SCole Faust#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
6853*c217d954SCole Faust#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
6854*c217d954SCole Faust#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
6855*c217d954SCole Faust    VLOAD(WIDTH)                                                \
6856*c217d954SCole Faust    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
6857*c217d954SCole Faust#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
6858*c217d954SCole Faust
6859*c217d954SCole Faust
/* Vector store. V_STORE forwards to V_STORE_STR, which pastes TENSOR_TYPE into the
 * name and so selects V_STORE_BUFFER or V_STORE_IMAGE.
 *
 * V_STORE_BUFFER: VSTORE(WIDTH) of VALUES with offset 0 to the address
 *   TENSOR_ptr + TENSOR_offset_first_element_in_bytes
 *              + X * sizeof(DATA_TYPE) + Y * STRIDE_Y
 *   (assumes TENSOR_ptr is a byte pointer - TODO confirm at the declaration).
 * V_STORE_IMAGE: WRITE_IMAGE2D of VALUES to TENSOR_img at coordinates (X / 4, Y),
 *   with the width converted by CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT; STRIDE_Y is
 *   not used. */
6860*c217d954SCole Faust#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
6861*c217d954SCole Faust#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
6862*c217d954SCole Faust#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
6863*c217d954SCole Faust    VSTORE(WIDTH)                                                \
6864*c217d954SCole Faust    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
6865*c217d954SCole Faust#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
6866*c217d954SCole Faust
6867*c217d954SCole Faust
/* Loads a tile of HEIGHT rows, each WIDTH elements wide, into dst.
 * For _i = 0 .. HEIGHT-1, dst[_i].v receives V_LOAD at column X and row
 * Y + _i * (int)YI_MULTIPLIER, so YI_MULTIPLIER is the row distance between
 * consecutive tile rows. */
6868*c217d954SCole Faust#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
6869*c217d954SCole Faust    ({                                                                                                                 \
6870*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
6871*c217d954SCole Faust        {                                                                                                              \
6872*c217d954SCole Faust            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
6873*c217d954SCole Faust        })                                                                                                             \
6874*c217d954SCole Faust    })
6875*c217d954SCole Faust
6876*c217d954SCole Faust
/* Loads a tile of HEIGHT rows whose row positions come from an index tile.
 * For _i = 0 .. HEIGHT-1, dst[_i].v receives V_LOAD at column X and row
 * indirect_y[_i].v. */
6877*c217d954SCole Faust#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)    \
6878*c217d954SCole Faust    ({                                                                                                  \
6879*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                           \
6880*c217d954SCole Faust        {                                                                                               \
6881*c217d954SCole Faust            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
6882*c217d954SCole Faust        })                                                                                              \
6883*c217d954SCole Faust    })
6884*c217d954SCole Faust
6885*c217d954SCole Faust
/* Indirect tile load that chooses between a partial and a full row width.
 * Rows are visited from index HEIGHT-1 down to 0; the row position is
 * indirect_y[row].v.
 *  - WIDTH1_CONDITION true : VLOAD_PARTIAL(WIDTH0, WIDTH1) into dst[row].v,
 *    addressing the buffer directly through TENSOR_ptr (TENSOR_TYPE is not
 *    consulted on this path).
 *  - WIDTH1_CONDITION false: full-width V_LOAD of WIDTH0 elements.
 * NOTE(review): STRIDE_Y and HEIGHT are used unparenthesized in the partial path,
 * unlike V_LOAD_BUFFER; callers should pass simple tokens. */
6886*c217d954SCole Faust#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y)                                                      \
6887*c217d954SCole Faust    ({                                                                                                                                                                                             \
6888*c217d954SCole Faust        if(WIDTH1_CONDITION)                                                                                                                                                                       \
6889*c217d954SCole Faust        {                                                                                                                                                                                          \
6890*c217d954SCole Faust            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
6891*c217d954SCole Faust            {                                                                                                                                                                                      \
6892*c217d954SCole Faust                VLOAD_PARTIAL(WIDTH0, WIDTH1)                                                         \
6893*c217d954SCole Faust                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y));               \
6894*c217d954SCole Faust            })                                                                                                                                                                                     \
6895*c217d954SCole Faust        }                                                                                                                                                                                          \
6896*c217d954SCole Faust        else                                                                                                                                                                                       \
6897*c217d954SCole Faust        {                                                                                                                                                                                          \
6898*c217d954SCole Faust            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
6899*c217d954SCole Faust            {                                                                                                                                                                                      \
6900*c217d954SCole Faust                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
6901*c217d954SCole Faust            })                                                                                                                                                                                     \
6902*c217d954SCole Faust        }                                                                                                                                                                                          \
6903*c217d954SCole Faust    })
6904*c217d954SCole Faust
/* Loads a TILE_HEIGHT x TILE_WIDTH spatial tile; each entry is a
 * TILE_CHANNELS-wide vector read at channel offset C.
 * For every (_yk, _xk) the row index passed to V_LOAD is
 *   (X + _xk) + (Y + _yk) * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT.
 * The load is performed only when X + _xk lies in [0, TENSOR_WIDTH) and Y + _yk
 * lies in [0, TENSOR_HEIGHT); otherwise dst[_xk + _yk * TILE_WIDTH] is left
 * untouched, so callers needing padding values must initialise dst beforehand. */
6905*c217d954SCole Faust#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst)   \
6906*c217d954SCole Faust    ({                                                                                                                                                \
6907*c217d954SCole Faust        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                   \
6908*c217d954SCole Faust        {                                                                                                                                             \
6909*c217d954SCole Faust            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                \
6910*c217d954SCole Faust            {                                                                                                                                         \
6911*c217d954SCole Faust                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH);                                                                                \
6912*c217d954SCole Faust                _src_y    += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                        \
6913*c217d954SCole Faust                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
6914*c217d954SCole Faust                if(_src_valid_y != 0)                                                                                                                 \
6915*c217d954SCole Faust                {                                                                                                                                     \
6916*c217d954SCole Faust                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                     \
6917*c217d954SCole Faust                }                                                                                                                                     \
6918*c217d954SCole Faust            })                                                                                                                                        \
6919*c217d954SCole Faust        })                                                                                                                                            \
6920*c217d954SCole Faust    })
6921*c217d954SCole Faust
6922*c217d954SCole Faust
/* Dilated variant of the spatial tile load.
 * For every (_yk, _xk) the source coordinates are
 *   _src_y = X + _xk * DILATION_X,  _src_z = Y + _yk * DILATION_Y,  _src_w = B
 * and the address is built from TENSOR_stride_y / _stride_z / _stride_w plus
 * C * sizeof(DATA_TYPE). Addressing always goes through TENSOR_ptr; TENSOR_TYPE is
 * not consulted.
 *  - BOUNDARY_CHECK false: the load is performed unconditionally.
 *  - BOUNDARY_CHECK true : the load is performed only when the coordinates are
 *    inside [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT); otherwise the dst entry is
 *    left untouched. */
6923*c217d954SCole Faust#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst)         \
6924*c217d954SCole Faust    ({ \
6925*c217d954SCole Faust        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
6926*c217d954SCole Faust        { \
6927*c217d954SCole Faust            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
6928*c217d954SCole Faust            { \
6929*c217d954SCole Faust                int _src_y = (X) + _xk * (DILATION_X); \
6930*c217d954SCole Faust                int _src_z = ((Y) + _yk * (DILATION_Y)); \
6931*c217d954SCole Faust                int _src_w    = (B); \
6932*c217d954SCole Faust                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
6933*c217d954SCole Faust                if(!(BOUNDARY_CHECK)) \
6934*c217d954SCole Faust                { \
6935*c217d954SCole Faust                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
6936*c217d954SCole Faust                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
6937*c217d954SCole Faust                } \
6938*c217d954SCole Faust                else \
6939*c217d954SCole Faust                { \
6940*c217d954SCole Faust                    if(_src_valid_y) \
6941*c217d954SCole Faust                    { \
6942*c217d954SCole Faust                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
6943*c217d954SCole Faust                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
6944*c217d954SCole Faust                    }                                                                                                                                                                                                 \
6945*c217d954SCole Faust                } \
6946*c217d954SCole Faust            })                                                                                                                                                                                                             \
6947*c217d954SCole Faust        })                                                                                                                                                                                                             \
6948*c217d954SCole Faust    })
6949*c217d954SCole Faust
6950*c217d954SCole Faust
/* Spatial tile load whose per-entry offsets come from the index tiles xi and yi.
 * For _i = 0 .. TILE_AREA-1 the row index passed to V_LOAD is
 *   (X + xi[_i].v) + (Y + yi[_i].v) * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT.
 * The load is performed only when X + xi[_i].v lies in [0, TENSOR_WIDTH) and
 * Y + yi[_i].v lies in [0, TENSOR_HEIGHT); otherwise dst[_i] is left untouched. */
6951*c217d954SCole Faust#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst)                \
6952*c217d954SCole Faust    ({                                                                                                                                                                \
6953*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
6954*c217d954SCole Faust        {                                                                                                                                                             \
6955*c217d954SCole Faust            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH);                                                                                          \
6956*c217d954SCole Faust            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                                               \
6957*c217d954SCole Faust            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
6958*c217d954SCole Faust            if(_src_valid_y != 0)                                                                                                                                     \
6959*c217d954SCole Faust            {                                                                                                                                                         \
6960*c217d954SCole Faust                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
6961*c217d954SCole Faust            }                                                                                                                                                         \
6962*c217d954SCole Faust        })                                                                                                                                                            \
6963*c217d954SCole Faust    })
6964*c217d954SCole Faust
6965*c217d954SCole Faust
/* Entry point for an indirect 2D tile load. It forwards to T_LOAD2D_INDIRECT_STR
 * so that TENSOR_TYPE is macro-expanded before it is token-pasted. */
6966*c217d954SCole Faust#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
/* Pastes TENSOR_TYPE onto T_LOAD2D_INDIRECT_ to select the implementation; the
 * variants defined in this file section are T_LOAD2D_INDIRECT_BUFFER and
 * T_LOAD2D_INDIRECT_IMAGE. */
6967*c217d954SCole Faust#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
/* Indirect 2D load, BUFFER variant.
 *
 * For each _i in [0, TILE_AREA) the row index is read from yi[0].s[_i] (all
 * indices live in the first element of the `yi` tile). A row is loaded through
 * V_LOAD only when its index is >= 0; for a negative index dst[_i].v is left
 * untouched, so a negative index acts as a "skip this row" marker.
 */
6968*c217d954SCole Faust#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
6969*c217d954SCole Faust    ({ \
6970*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
6971*c217d954SCole Faust        { \
6972*c217d954SCole Faust            if(yi[0].s[_i] >= 0) \
6973*c217d954SCole Faust            { \
6974*c217d954SCole Faust                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
6975*c217d954SCole Faust            } \
6976*c217d954SCole Faust        }) \
6977*c217d954SCole Faust    })
6978*c217d954SCole Faust
/* Indirect 2D load, IMAGE variant.
 *
 * Same addressing as the BUFFER variant (row index taken from yi[0].s[_i]), but
 * every row is loaded unconditionally: there is no check for a negative index.
 * NOTE(review): presumably out-of-range coordinates are handled by the image
 * read path behind V_LOAD - confirm against the V_LOAD image implementation.
 */
6979*c217d954SCole Faust#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
6980*c217d954SCole Faust    ({ \
6981*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
6982*c217d954SCole Faust        { \
6983*c217d954SCole Faust            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
6984*c217d954SCole Faust        }) \
6985*c217d954SCole Faust    })
6986*c217d954SCole Faust
6987*c217d954SCole Faust
/* Gathers TILE_AREA rows of TENSOR into the tile `dst` using per-row x / y / z
 * offsets, skipping rows whose coordinates fall outside the tensor.
 *
 * For every _i in the LOOP_UNROLLING range (start 0, step 1, bound TILE_AREA):
 *   - the row index handed to V_LOAD is
 *       (X + xi[_i].v)
 *     + (Y + yi[_i].v) * TENSOR_WIDTH
 *     + (Z + zi[_i].v) * TENSOR_WIDTH * TENSOR_HEIGHT
 *     + B * TENSOR_WIDTH * TENSOR_HEIGHT * TENSOR_DEPTH
 *   - the load only happens when X + xi[_i].v is in [0, TENSOR_WIDTH),
 *     Y + yi[_i].v is in [0, TENSOR_HEIGHT) and Z + zi[_i].v is in
 *     [0, TENSOR_DEPTH); otherwise dst[_i].v is not written and keeps its
 *     previous value.
 *
 * DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C and STRIDE_Y are forwarded
 * unchanged to V_LOAD (defined outside this section).
 *
 * TENSOR_WIDTH and TENSOR_HEIGHT are individually parenthesised in the z-plane
 * stride so that expression arguments (e.g. `w + pad`) keep their intended
 * precedence, matching how they are used everywhere else in this macro.
 */
6988*c217d954SCole Faust#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
6989*c217d954SCole Faust    ({                                                                                                                                                                \
6990*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
6991*c217d954SCole Faust        {                                                                                                                                                             \
6992*c217d954SCole Faust            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * ((TENSOR_WIDTH) * (TENSOR_HEIGHT));                                  \
6993*c217d954SCole Faust            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH);                                                                         \
6994*c217d954SCole Faust            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)   \
6995*c217d954SCole Faust                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH));                                                                     \
6996*c217d954SCole Faust            if(_src_valid_y != 0)                                                                                                                                     \
6997*c217d954SCole Faust            {                                                                                                                                                         \
6998*c217d954SCole Faust                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
6999*c217d954SCole Faust            }                                                                                                                                                         \
7000*c217d954SCole Faust        })                                                                                                                                                            \
7001*c217d954SCole Faust    })
7002*c217d954SCole Faust
7003*c217d954SCole Faust
/* Stores the HEIGHT rows of tile `src` to TENSOR at per-row indirect positions.
 *
 * Destination address of a row r:
 *   TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes
 *     + X * sizeof(DATA_TYPE) + indirect_y[r].v * STRIDE_Y
 * Each row is converted to VEC_DATA_TYPE(DATA_TYPE, WIDTH0) with CONVERT before
 * being written.
 *
 * WIDTH1_CONDITION selects the store flavour for the whole tile:
 *   - true : VSTORE_PARTIAL(WIDTH0, WIDTH1)
 *   - false: VSTORE(WIDTH0)
 *
 * Rows are visited as r = HEIGHT - 1 - _i, i.e. from the last row to the first.
 * NOTE(review): the reverse order presumably matters when several indirect_y
 * entries alias the same row (lowest row wins) - confirm with the callers.
 */
7004*c217d954SCole Faust#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y)                                                      \
7005*c217d954SCole Faust    ({                                                                                                                                                                                             \
7006*c217d954SCole Faust        if(WIDTH1_CONDITION)                                                                                                                                                                       \
7007*c217d954SCole Faust        {                                                                                                                                                                                          \
7008*c217d954SCole Faust            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
7009*c217d954SCole Faust            {                                                                                                                                                                                      \
7010*c217d954SCole Faust                VSTORE_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                     \
7011*c217d954SCole Faust                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
7012*c217d954SCole Faust            })                                                                                                                                                                                     \
7013*c217d954SCole Faust        }                                                                                                                                                                                          \
7014*c217d954SCole Faust        else                                                                                                                                                                                       \
7015*c217d954SCole Faust        {                                                                                                                                                                                          \
7016*c217d954SCole Faust            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
7017*c217d954SCole Faust            {                                                                                                                                                                                      \
7018*c217d954SCole Faust                VSTORE(WIDTH0)                                                                                                                                                                     \
7019*c217d954SCole Faust                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
7020*c217d954SCole Faust            })                                                                                                                                                                                     \
7021*c217d954SCole Faust        }                                                                                                                                                                                          \
7022*c217d954SCole Faust    })
7023*c217d954SCole Faust
7024*c217d954SCole Faust
/* Adds the quantization offset-correction terms to the accumulator tile `dst`.
 *
 * For every row _m0 in [0, M0) and column _n0 in [0, N0):
 *   dst[_m0].s[_n0] += WEI_OFFSET * sum_k(lhs[_m0].s[k])
 *                    + SRC_OFFSET * sum_k(rhs[_n0].s[k])      (k in [0, K0))
 * All products are computed after casting both operands to ACC_DATA_TYPE.
 *
 * The lhs row sum (_tm) is computed once per row and reused for every column.
 * The rhs sum does not depend on _m0 but is recomputed for each row.
 */
7025*c217d954SCole Faust#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst)        \
7026*c217d954SCole Faust    ({                                                                                               \
7027*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                           \
7028*c217d954SCole Faust        {                                                                                            \
7029*c217d954SCole Faust            ACC_DATA_TYPE _tm = 0;                                                                   \
7030*c217d954SCole Faust            LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                       \
7031*c217d954SCole Faust            {                                                                                        \
7032*c217d954SCole Faust                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET);                 \
7033*c217d954SCole Faust            })                                                                                       \
7034*c217d954SCole Faust            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                       \
7035*c217d954SCole Faust            {                                                                                        \
7036*c217d954SCole Faust                dst[_m0].s[_n0] += _tm;                                                              \
7037*c217d954SCole Faust                LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                   \
7038*c217d954SCole Faust                {                                                                                    \
7039*c217d954SCole Faust                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
7040*c217d954SCole Faust                })                                                                                   \
7041*c217d954SCole Faust            })                                                                                       \
7042*c217d954SCole Faust        })                                                                                          \
7043*c217d954SCole Faust    })
7044*c217d954SCole Faust
7045*c217d954SCole Faust
/* Entry point for 8-bit requantization of a tile. It forwards to T_QUANTIZE8_STR
 * so that QUANTIZATION_TYPE is macro-expanded before it is token-pasted. */
7046*c217d954SCole Faust#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
/* Pastes QUANTIZATION_TYPE onto T_QUANTIZE8_ and forwards the remaining eleven
 * arguments. T_QUANTIZE8_PER_TENSOR and T_QUANTIZE8_PER_CHANNEL accept that
 * argument list; T_QUANTIZE8_ASYMMETRIC takes nine parameters and therefore
 * does not match this dispatcher's call shape. */
7047*c217d954SCole Faust#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
7048*c217d954SCole Faust
7049*c217d954SCole Faust
/* Requantizes every element of tile `src` into tile `dst` using one
 * multiplier / shift pair (DST_MULTIPLIER, DST_SHIFT) for the whole tensor.
 * dst_multipliers and dst_shifts are accepted for signature compatibility with
 * the per-channel variant and are not used here.
 *
 * Per element:
 *   1. If DST_SHIFT is negative, the input is pre-multiplied by 2^(-DST_SHIFT);
 *      otherwise it is multiplied by 1.
 *   2. The input and DST_MULTIPLIER are multiplied in 64 bits, a rounding nudge
 *      (+2^30 for a non-negative product, 1 - 2^30 otherwise) is added and the
 *      sum is divided by 2^31. If both operands equal INT_MIN the result is
 *      replaced by INT_MAX.
 *      NOTE(review): this looks like a saturating rounding doubling high
 *      multiply as used in gemmlowp-style fixed point - confirm.
 *   3. If DST_SHIFT >= 0, a rounding right shift by DST_SHIFT is applied: the
 *      result is incremented when the discarded bits exceed the threshold
 *      (half of the mask, plus one for negative values).
 *   4. DST_OFFSET is added and the value is saturate-converted to DST_DATA_TYPE.
 *
 * NOTE(review): the comparisons against INT_MIN / INT_MAX suggest SRC_DATA_TYPE
 * is expected to be a 32-bit int - confirm with callers.
 */
7050*c217d954SCole Faust#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
7051*c217d954SCole Faust    ({ \
7052*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7053*c217d954SCole Faust        { \
7054*c217d954SCole Faust            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
7055*c217d954SCole Faust            { \
7056*c217d954SCole Faust                SRC_DATA_TYPE _tmp = 0; \
7057*c217d954SCole Faust                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
7058*c217d954SCole Faust                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
7059*c217d954SCole Faust                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
7060*c217d954SCole Faust                long a_64 = (long)(_src); \
7061*c217d954SCole Faust                long b_64 = (long)(DST_MULTIPLIER); \
7062*c217d954SCole Faust                long ab_64 = a_64 * b_64; \
7063*c217d954SCole Faust                long mask1 = 1 << 30; \
7064*c217d954SCole Faust                long mask2 = 1 - (1 << 30); \
7065*c217d954SCole Faust                long is_positive_or_zero = ab_64 >= 0; \
7066*c217d954SCole Faust                long nudge = select(mask2, mask1, is_positive_or_zero); \
7067*c217d954SCole Faust                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
7068*c217d954SCole Faust                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
7069*c217d954SCole Faust                if(DST_SHIFT >= 0) \
7070*c217d954SCole Faust                { \
7071*c217d954SCole Faust                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
7072*c217d954SCole Faust                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
7073*c217d954SCole Faust                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
7074*c217d954SCole Faust                } \
7075*c217d954SCole Faust                _tmp += DST_OFFSET; \
7076*c217d954SCole Faust                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
7077*c217d954SCole Faust            })                                                                                                                                          \
7078*c217d954SCole Faust        })                                                                                                                                          \
7079*c217d954SCole Faust    })
7080*c217d954SCole Faust
7081*c217d954SCole Faust
/* Requantizes every element of tile `src` into tile `dst` using a separate
 * multiplier / shift per column: column _n0 uses dst_multipliers[0].s[_n0] and
 * dst_shifts[0].s[_n0]. The DST_SHIFT and DST_MULTIPLIER parameters are not
 * used by this variant; DST_OFFSET is shared by all columns.
 *
 * Per element the arithmetic follows the same steps as the per-tensor variant
 * (optional pre-multiplication by 2^(-shift), 64-bit multiply with rounding
 * nudge and division by 2^31, INT_MAX on the INT_MIN * INT_MIN case), except
 * that the rounding right shift is written branch-free:
 *   - _tmp2 holds the shifted and rounded value, computed unconditionally;
 *   - the final select keeps _tmp2 only when the shift is >= 0, otherwise the
 *     unshifted _tmp is kept.
 * The rounding threshold is (mask >> 1) + any(_tmp).
 * NOTE(review): any() on a scalar presumably yields 1 when the value is
 * negative (most significant bit set), which would match the per-tensor
 * variant - confirm against the OpenCL C specification.
 * NOTE(review): for a negative shift the expressions `1 << _dst_shift` and
 * `_tmp >> _dst_shift` are still evaluated before being discarded - confirm
 * this is well-defined on the target compilers.
 */
7082*c217d954SCole Faust#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
7083*c217d954SCole Faust    ({ \
7084*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7085*c217d954SCole Faust        { \
7086*c217d954SCole Faust            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
7087*c217d954SCole Faust            { \
7088*c217d954SCole Faust                SRC_DATA_TYPE _tmp = 0; \
7089*c217d954SCole Faust                SRC_DATA_TYPE _tmp2 = 0; \
7090*c217d954SCole Faust                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
7091*c217d954SCole Faust                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
7092*c217d954SCole Faust                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
7093*c217d954SCole Faust                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
7094*c217d954SCole Faust                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
7095*c217d954SCole Faust                long a_64 = (long)(_src); \
7096*c217d954SCole Faust                long b_64 = (long)(_dst_multiplier); \
7097*c217d954SCole Faust                long ab_64 = a_64 * b_64; \
7098*c217d954SCole Faust                long mask1 = 1 << 30; \
7099*c217d954SCole Faust                long mask2 = 1 - (1 << 30); \
7100*c217d954SCole Faust                long is_positive_or_zero = ab_64 >= 0; \
7101*c217d954SCole Faust                long nudge = select(mask2, mask1, is_positive_or_zero); \
7102*c217d954SCole Faust                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
7103*c217d954SCole Faust                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
7104*c217d954SCole Faust                long mask = ((((int)1) << _dst_shift) - (int)1); \
7105*c217d954SCole Faust                long threshold = (mask >> 1) + any(_tmp); \
7106*c217d954SCole Faust                _tmp2 = _tmp >> _dst_shift; \
7107*c217d954SCole Faust                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
7108*c217d954SCole Faust                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
7109*c217d954SCole Faust                _tmp += DST_OFFSET; \
7110*c217d954SCole Faust                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
7111*c217d954SCole Faust            })                                                                                                                                          \
7112*c217d954SCole Faust        })                                                                                                                                         \
7113*c217d954SCole Faust    })
7114*c217d954SCole Faust
7115*c217d954SCole Faust
/* Requantizes every element of tile `src` into tile `dst` using one
 * multiplier / shift pair (DST_MULTIPLIER, DST_SHIFT) for all elements.
 * The parameter list has no dst_multipliers / dst_shifts; apart from that the
 * per-element steps match T_QUANTIZE8_PER_TENSOR:
 *   1. If DST_SHIFT is negative, pre-multiply the input by 2^(-DST_SHIFT).
 *   2. Multiply by DST_MULTIPLIER in 64 bits, add the rounding nudge
 *      (+2^30 for a non-negative product, 1 - 2^30 otherwise) and divide by
 *      2^31; use INT_MAX when both operands equal INT_MIN.
 *   3. If DST_SHIFT >= 0, apply a rounding right shift by DST_SHIFT.
 *   4. Add DST_OFFSET and saturate-convert to DST_DATA_TYPE.
 *
 * NOTE(review): the comparisons against INT_MIN / INT_MAX suggest SRC_DATA_TYPE
 * is expected to be a 32-bit int - confirm with callers.
 */
7116*c217d954SCole Faust#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)                          \
7117*c217d954SCole Faust    ({ \
7118*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7119*c217d954SCole Faust        { \
7120*c217d954SCole Faust            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
7121*c217d954SCole Faust            { \
7122*c217d954SCole Faust                SRC_DATA_TYPE _tmp = 0; \
7123*c217d954SCole Faust                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
7124*c217d954SCole Faust                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
7125*c217d954SCole Faust                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
7126*c217d954SCole Faust                long a_64 = (long)(_src); \
7127*c217d954SCole Faust                long b_64 = (long)(DST_MULTIPLIER); \
7128*c217d954SCole Faust                long ab_64 = a_64 * b_64; \
7129*c217d954SCole Faust                long mask1 = 1 << 30; \
7130*c217d954SCole Faust                long mask2 = 1 - (1 << 30); \
7131*c217d954SCole Faust                long is_positive_or_zero = ab_64 >= 0; \
7132*c217d954SCole Faust                long nudge = select(mask2, mask1, is_positive_or_zero); \
7133*c217d954SCole Faust                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
7134*c217d954SCole Faust                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
7135*c217d954SCole Faust                if(DST_SHIFT >= 0) \
7136*c217d954SCole Faust                { \
7137*c217d954SCole Faust                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
7138*c217d954SCole Faust                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
7139*c217d954SCole Faust                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
7140*c217d954SCole Faust                } \
7141*c217d954SCole Faust                _tmp += DST_OFFSET; \
7142*c217d954SCole Faust                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
7143*c217d954SCole Faust            })                                                                                                                                          \
7144*c217d954SCole Faust        })                                                                                                                                          \
7145*c217d954SCole Faust    })
7146*c217d954SCole Faust
7147*c217d954SCole Faust
/* Overwrites whole rows of tile `a` in place according to a per-row mask.
 *
 * For every row _m0 in [0, M0): when mask[_m0].v equals zero, each of the N0
 * elements a[_m0].s[_n0] is set to VALUE_TO_SET; otherwise the element keeps
 * its current value. The comparison result is cast to
 * SELECT_DATA_TYPE(DATA_TYPE) (defined outside this section) before being
 * passed to select().
 * NOTE(review): mask[_m0].v is compared against a scalar zero, so the mask tile
 * is presumably one element wide - confirm with callers.
 */
7148*c217d954SCole Faust#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask)                                                                                            \
7149*c217d954SCole Faust    ({                                                                                                                                                     \
7150*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                 \
7151*c217d954SCole Faust        {                                                                                                                                                  \
7152*c217d954SCole Faust            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                             \
7153*c217d954SCole Faust            {                                                                                                                                              \
7154*c217d954SCole Faust                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
7155*c217d954SCole Faust            })                                                                                                                                             \
7156*c217d954SCole Faust        })                                                                                                                                                 \
7157*c217d954SCole Faust    })
7158*c217d954SCole Faust
7159*c217d954SCole Faust
/* Applies an activation function to tile `src` row by row, writing tile `dst`.
 *
 * Each of the M0 rows is processed as one vector:
 *   dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL)
 * ACTIVATION is defined outside this section; ACTIVATION_TYPE, A_VAL and B_VAL
 * are forwarded to it unchanged.
 */
7160*c217d954SCole Faust#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst)               \
7161*c217d954SCole Faust    ({                                                                                         \
7162*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                     \
7163*c217d954SCole Faust        {                                                                                      \
7164*c217d954SCole Faust            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
7165*c217d954SCole Faust        })                                                                                     \
7166*c217d954SCole Faust    })
7167*c217d954SCole Faust
7168*c217d954SCole Faust
/* Quantized activation primitives. All five share one parameter list so that
 * ACT_OP_QUANTIZED can select one by pasting the op name; parameters a given
 * op does not need are simply ignored. */
/* relu: max(ZERO_VALUE, x). */
7169*c217d954SCole Faust#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))
7170*c217d954SCole Faust
/* bounded relu: clamps x to the range [ZERO_VALUE, A_VAL]. */
7171*c217d954SCole Faust#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))
7172*c217d954SCole Faust
/* lower/upper bounded relu: clamps x to the range [B_VAL, A_VAL]. */
7173*c217d954SCole Faust#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
7174*c217d954SCole Faust
/* hard swish: x * clamp(x + 3, 0, 6) * 0.166666667, with the constants written
 * as float literals cast to DATA_TYPE. ZERO_VALUE, A_VAL and B_VAL are unused.
 * NOTE(review): with an integer DATA_TYPE the cast of 0.166666667f would
 * truncate to 0 - confirm which DATA_TYPE callers use for this op. */
7175*c217d954SCole Faust#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
7176*c217d954SCole Faust
/* identity: returns x unchanged. */
7177*c217d954SCole Faust#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)
7178*c217d954SCole Faust
/* Pastes `op` onto _op_quantized to pick one of the quantized activation
 * primitives (e.g. op = relu selects relu_op_quantized). */
7179*c217d954SCole Faust#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
/* Public entry point: the extra level of indirection lets `op` be macro-expanded
 * before ACT_OP_QUANTIZED token-pastes it. */
7180*c217d954SCole Faust#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
7181*c217d954SCole Faust
/* Binary arithmetic helpers, passed by name as the T_ELWISE_OP argument of the
 * T_ELTWISE* macros. Both operands and the result are fully parenthesised. */
7182*c217d954SCole Faust#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
7183*c217d954SCole Faust#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
7184*c217d954SCole Faust#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
7185*c217d954SCole Faust#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
7186*c217d954SCole Faust
7187*c217d954SCole Faust
/* Applies a quantized activation to tile `src` row by row, writing tile `dst`.
 *
 * Each of the M0 rows is processed as one vector:
 *   dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0,
 *                                     ZERO_VALUE, A_VAL, B_VAL, src[_m0].v)
 * ACTIVATION_TYPE names the primitive (it is pasted onto _op_quantized).
 */
7188*c217d954SCole Faust#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst)               \
7189*c217d954SCole Faust    ({ \
7190*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7191*c217d954SCole Faust        { \
7192*c217d954SCole Faust            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
7193*c217d954SCole Faust        })                                                                                          \
7194*c217d954SCole Faust    })
7195*c217d954SCole Faust
7196*c217d954SCole Faust
/* Element-wise tile addition: dst[_m0].v = lhs[_m0].v + rhs[_m0].v for each of
 * the M0 rows. DATA_TYPE and N0 are part of the signature but are not used in
 * the expansion; no type conversion is performed. */
7197*c217d954SCole Faust#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
7198*c217d954SCole Faust    ({                                                            \
7199*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7200*c217d954SCole Faust        {                                                         \
7201*c217d954SCole Faust            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
7202*c217d954SCole Faust        })                                                        \
7203*c217d954SCole Faust    })
7204*c217d954SCole Faust
7205*c217d954SCole Faust
/* Adds a constant to every row of a tile:
 *   dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant   for _m0 in [0, M0).
 * rhs_constant is cast to the scalar DATA_TYPE before the addition; N0 is not
 * used in the expansion. */
7206*c217d954SCole Faust#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
7207*c217d954SCole Faust    ({                                                            \
7208*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7209*c217d954SCole Faust        {                                                         \
7210*c217d954SCole Faust            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant;               \
7211*c217d954SCole Faust        })                                                        \
7212*c217d954SCole Faust    })
7213*c217d954SCole Faust
/* Named shortcuts for broadcast element-wise operations.
 *   - *_RHS_X_* and the unprefixed *_ADD_X / *_DIV_X names forward to
 *     T_ELTWISE_BROADCAST_X, which combines each lhs row with rhs[0].
 *   - *_LHS_X_* names forward to T_ELTWISE_BROADCAST_LHS_X, which combines
 *     lhs[0] with each rhs row.
 * The operation itself is selected by passing V_ADD, V_SUB, V_DIV or V_MUL. */
7214*c217d954SCole Faust#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7215*c217d954SCole Faust#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7216*c217d954SCole Faust#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7217*c217d954SCole Faust
7218*c217d954SCole Faust#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7219*c217d954SCole Faust#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7220*c217d954SCole Faust
7221*c217d954SCole Faust#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7222*c217d954SCole Faust
7223*c217d954SCole Faust#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7224*c217d954SCole Faust#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7225*c217d954SCole Faust
7226*c217d954SCole Faust
/* Multiplies every row of a tile by a constant:
 *   dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant   for _m0 in [0, M0).
 * rhs_constant is cast to the scalar DATA_TYPE before the multiplication; N0 is
 * not used in the expansion. */
7227*c217d954SCole Faust#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
7228*c217d954SCole Faust    ({                                                            \
7229*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7230*c217d954SCole Faust        {                                                         \
7231*c217d954SCole Faust            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
7232*c217d954SCole Faust        })                                                        \
7233*c217d954SCole Faust    })
7234*c217d954SCole Faust
7235*c217d954SCole Faust
/* Element-wise binary operation with the right-hand operand broadcast over rows.
 *
 * For each _m0 in [0, M0):
 *   dst[_m0].v = T_ELWISE_OP(lhs[_m0].v, rhs[0].v)
 * Both operands are first converted to VEC_DATA_TYPE(DST_DATA_TYPE, N0) with
 * CONVERT. Only row 0 of `rhs` is ever read.
 * T_ELWISE_OP is a two-argument macro name such as V_ADD or V_MUL.
 */
7236*c217d954SCole Faust#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7237*c217d954SCole Faust    ({                                                      \
7238*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7239*c217d954SCole Faust        {                                                   \
7240*c217d954SCole Faust            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7241*c217d954SCole Faust        })                                                  \
7242*c217d954SCole Faust    })
7243*c217d954SCole Faust
7244*c217d954SCole Faust
/* Element-wise binary operation with the left-hand operand broadcast over rows.
 *
 * For each _m0 in [0, M0):
 *   dst[_m0].v = T_ELWISE_OP(lhs[0].v, rhs[_m0].v)
 * Both operands are first converted to VEC_DATA_TYPE(DST_DATA_TYPE, N0) with
 * CONVERT. Only row 0 of `lhs` is ever read; operand order is preserved, which
 * matters for non-commutative ops such as V_SUB.
 */
7245*c217d954SCole Faust#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7246*c217d954SCole Faust    ({                                                      \
7247*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7248*c217d954SCole Faust        {                                                   \
7249*c217d954SCole Faust            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7250*c217d954SCole Faust        })                                                  \
7251*c217d954SCole Faust    })
7252*c217d954SCole Faust
/* Named shortcuts for non-broadcast element-wise operations: each forwards to
 * T_ELTWISE with the matching V_ADD / V_SUB / V_DIV / V_MUL helper. */
7253*c217d954SCole Faust#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7254*c217d954SCole Faust#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7255*c217d954SCole Faust#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7256*c217d954SCole Faust#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7257*c217d954SCole Faust
7258*c217d954SCole Faust
/* Element-wise binary operation between two tiles of the same height.
 *
 * For each _m0 in [0, M0):
 *   dst[_m0].v = T_ELWISE_OP(lhs[_m0].v, rhs[_m0].v)
 * Both operands are first converted to VEC_DATA_TYPE(DST_DATA_TYPE, N0) with
 * CONVERT. T_ELWISE_OP is a two-argument macro name such as V_ADD or V_MUL.
 */
7259*c217d954SCole Faust#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7260*c217d954SCole Faust    ({                                                      \
7261*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7262*c217d954SCole Faust        {                                                   \
7263*c217d954SCole Faust            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7264*c217d954SCole Faust        })                                                  \
7265*c217d954SCole Faust    })
7266*c217d954SCole Faust
7267*c217d954SCole Faust
/* Applies floor() to every row of tile `src`, writing tile `dst`.
 * Each row is converted to VEC_DATA_TYPE(DST_DATA_TYPE, N0) with CONVERT before
 * floor() is called, so the rounding happens in the destination type. */
7268*c217d954SCole Faust#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
7269*c217d954SCole Faust    ({                                                      \
7270*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7271*c217d954SCole Faust        {                                                   \
7272*c217d954SCole Faust            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7273*c217d954SCole Faust        })                                                  \
7274*c217d954SCole Faust    })
7275*c217d954SCole Faust
7276*c217d954SCole Faust
/* Tile matrix-multiply dispatch chain.
 *
 * T_MMUL pastes LHS_LAYOUT and RHS_LAYOUT into the name
 * T_MMUL_<LHS_LAYOUT>_<RHS_LAYOUT>. The paste happens directly in T_MMUL, so the
 * layout arguments are used as written (not macro-expanded first). Only the
 * NT / T combination is defined in this section. */
7277*c217d954SCole Faust#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
/* Second dispatch level: pastes the three data type names to pick the
 * implementation for that type combination. */
7278*c217d954SCole Faust#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
/* Supported type combinations. float/half inputs map to T_MMUL_NT_T_FLOAT;
 * char/uchar inputs map to T_MMUL_NT_T_INTEGER8 (defined outside this section).
 * Any other combination expands to an undefined macro name. */
7279*c217d954SCole Faust#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7280*c217d954SCole Faust#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7281*c217d954SCole Faust#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7282*c217d954SCole Faust#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7283*c217d954SCole Faust#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7284*c217d954SCole Faust#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
/** Floating-point tile multiply-accumulate.
 *
 * For every (_m, _n, _k) triple the macro performs
 *   dst[_m].s[_n] = fma(lhs[_m].s[_k], rhs[_n].s[_k], dst[_m].s[_n])
 * with both factors cast to DST_DATA_TYPE first. rhs is indexed as
 * rhs[_n].s[_k], i.e. its rows are selected by the output column index.
 * dst is read as the accumulator and is never cleared here, so it must hold
 * valid values before the macro is used.
 *
 * LHS_DATA_TYPE and RHS_DATA_TYPE are accepted but not referenced in the body.
 * Unlike T_MMUL_NT_T_INTEGER8, this macro expands to a plain `{ ... }` block,
 * not to a `({ ... })` statement expression.
 *
 * NOTE(review): LOOP_UNROLLING is defined outside this section; the three
 * loops are assumed to cover [0, M0), [0, N0) and [0, K0) -- confirm.
 *
 * @param LHS_DATA_TYPE Unused in the body
 * @param RHS_DATA_TYPE Unused in the body
 * @param DST_DATA_TYPE Type both factors are cast to before the fma
 * @param M0            Bound of the _m loop (rows of lhs and dst)
 * @param N0            Bound of the _n loop (rows of rhs, elements per dst row)
 * @param K0            Bound of the _k loop (elements per lhs/rhs row)
 * @param lhs           Left-hand tile, accessed as lhs[_m].s[_k]
 * @param rhs           Right-hand tile, accessed as rhs[_n].s[_k]
 * @param dst           Accumulator tile, accessed as dst[_m].s[_n]
 */
7285*c217d954SCole Faust#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
7286*c217d954SCole Faust    {                                                                                     \
7287*c217d954SCole Faust        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                 \
7288*c217d954SCole Faust        {                                                                                 \
7289*c217d954SCole Faust            LOOP_UNROLLING(int, _n, 0, 1, N0,                                             \
7290*c217d954SCole Faust            {                                                                             \
7291*c217d954SCole Faust                LOOP_UNROLLING(int, _k, 0, 1, K0,                                         \
7292*c217d954SCole Faust                {                                                                         \
7293*c217d954SCole Faust                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
7294*c217d954SCole Faust                })                                                                        \
7295*c217d954SCole Faust            })                                                                            \
7296*c217d954SCole Faust        })                                                                                \
7297*c217d954SCole Faust    }
7298*c217d954SCole Faust
/** 8-bit integer tile multiply.
 *
 * For every (_m, _n) pair the whole row vectors lhs[_m].v and rhs[_n].v are
 * handed to DOT_PRODUCT_INTEGER8 together with K0 and the destination element
 * dst[_m].s[_n].
 *
 * NOTE(review): DOT_PRODUCT_INTEGER8 and LOOP_UNROLLING are defined outside
 * this section. The dot product presumably accumulates into its last argument
 * (as the floating-point variant does with fma) -- confirm.
 *
 * @param LHS_DATA_TYPE Forwarded to DOT_PRODUCT_INTEGER8
 * @param RHS_DATA_TYPE Forwarded to DOT_PRODUCT_INTEGER8
 * @param DST_DATA_TYPE Forwarded to DOT_PRODUCT_INTEGER8
 * @param M0            Bound of the _m loop
 * @param N0            Bound of the _n loop
 * @param K0            Forwarded to DOT_PRODUCT_INTEGER8
 * @param lhs           Left-hand tile, accessed as lhs[_m].v
 * @param rhs           Right-hand tile, accessed as rhs[_n].v
 * @param dst           Destination tile, accessed as dst[_m].s[_n]
 */
7299*c217d954SCole Faust#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                            \
7300*c217d954SCole Faust    ({ \
7301*c217d954SCole Faust        LOOP_UNROLLING(int, _m, 0, 1, M0, \
7302*c217d954SCole Faust        { \
7303*c217d954SCole Faust            LOOP_UNROLLING(int, _n, 0, 1, N0, \
7304*c217d954SCole Faust            { \
7305*c217d954SCole Faust                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
7306*c217d954SCole Faust            })                                                                                             \
7307*c217d954SCole Faust        })                                                                                             \
7308*c217d954SCole Faust    })
7309*c217d954SCole Faust
7310*c217d954SCole Faust#endif
7311*c217d954SCole Faust
7312*c217d954SCole Faust#if defined(RESHAPE_LHS_NT)
7313*c217d954SCole Faust
/** Reshape one block of the LHS matrix without transposing it.
 *
 * Built only when RESHAPE_LHS_NT is defined. The body relies on the
 * compile-time macros DATA_TYPE, M0, K0, PARTIAL_M0 and PARTIAL_K0, and
 * changes its output layout when INTERLEAVE is defined.
 *
 * Each invocation reads an M0 x K0 block of src into a zero-initialised tile
 * and writes it to dst as M0 rows of K0 elements.
 *
 * NOTE(review): TENSOR3D_T, GET_SPATIAL_IDX, TILE, LOOP_UNROLLING and the
 * T_LOAD/T_STORE_INDIRECT_WIDTH_SELECT macros are defined outside this
 * section. The src_* and dst_* names used below are presumably introduced by
 * TENSOR3D_T -- confirm.
 *
 * @param src Source tensor
 * @param dst Destination tensor
 * @param M   Row count used for the bottom-edge test and for the per-z source
 *            offset (z * M rows)
 * @param V0  Number of blocks, consecutive along y, that share one output row
 */
7314*c217d954SCole Faust__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER),
7315*c217d954SCole Faust                                         TENSOR3D_T(dst, BUFFER),
7316*c217d954SCole Faust                                         const int M,
7317*c217d954SCole Faust                                         const int V0)
7318*c217d954SCole Faust{
7319*c217d954SCole Faust
    // Number of elements in one M0 x K0 block.
7320*c217d954SCole Faust#define BLOCK_SIZE ((M0) * (K0))
7321*c217d954SCole Faust
7322*c217d954SCole Faust
    // Element offset between the starts of two neighbouring blocks that share
    // an output row: one row (K0) when interleaving, a whole block otherwise.
7323*c217d954SCole Faust#if defined(INTERLEAVE)
7324*c217d954SCole Faust#define OUTPUT_OFFSET_X (K0)
7325*c217d954SCole Faust#else
7326*c217d954SCole Faust#define OUTPUT_OFFSET_X (BLOCK_SIZE)
7327*c217d954SCole Faust#endif
7328*c217d954SCole Faust
7329*c217d954SCole Faust
    // Element distance between two consecutive rows of the same block in the
    // output. NOTE(review): the INTERLEAVE variant has no outer parentheses;
    // its only use below multiplies it, so precedence is unaffected here.
7330*c217d954SCole Faust#if defined(INTERLEAVE)
7331*c217d954SCole Faust#define OUTPUT_STEP_X (K0) * (V0)
7332*c217d954SCole Faust#else
7333*c217d954SCole Faust#define OUTPUT_STEP_X (K0)
7334*c217d954SCole Faust#endif
7335*c217d954SCole Faust
    // NOTE(review): presumably the work-item coordinates along dimensions 0, 1 and 2 -- confirm.
7336*c217d954SCole Faust    const int x = GET_SPATIAL_IDX(0, 1, 0);
7337*c217d954SCole Faust    const int y = GET_SPATIAL_IDX(1, 1, 0);
7338*c217d954SCole Faust    const int z = GET_SPATIAL_IDX(2, 1, 0);
7339*c217d954SCole Faust
    // Coordinates of the block's first element in the source.
7340*c217d954SCole Faust    const int xi = x * K0;
7341*c217d954SCole Faust    const int yi = y * M0;
7342*c217d954SCole Faust
    // Output position: V0 consecutive y values map to the same output row (yo)
    // and are placed OUTPUT_OFFSET_X elements apart inside it.
7343*c217d954SCole Faust    const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
7344*c217d954SCole Faust    const int yo = (y / V0);
7345*c217d954SCole Faust
7346*c217d954SCole Faust
    // The source offset for slice z is computed as z * M rows (not via src_stride_z).
7347*c217d954SCole Faust    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
7348*c217d954SCole Faust    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
7349*c217d954SCole Faust
7350*c217d954SCole Faust    TILE(DATA_TYPE, M0, K0, in);
7351*c217d954SCole Faust
7352*c217d954SCole Faust
    // Clear the tile before loading.
7353*c217d954SCole Faust    LOOP_UNROLLING(int, _i, 0, 1, M0,
7354*c217d954SCole Faust    {
7355*c217d954SCole Faust        in[_i].v = 0;
7356*c217d954SCole Faust    });
7357*c217d954SCole Faust
    // Edge flags: set when the block reaches or passes the right (src_w) or
    // bottom (M) boundary and a non-zero partial size is configured.
7358*c217d954SCole Faust    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
7359*c217d954SCole Faust    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
7360*c217d954SCole Faust
    // Identity row indices for the indirect load.
7361*c217d954SCole Faust    TILE(uint, M0, 1, in_indirect_y);
7362*c217d954SCole Faust    LOOP_UNROLLING(int, _i, 0, 1, M0,
7363*c217d954SCole Faust    {
7364*c217d954SCole Faust        in_indirect_y[_i].v = _i;
7365*c217d954SCole Faust
7366*c217d954SCole Faust    });
    // Load PARTIAL_M0 rows on the bottom edge, M0 rows otherwise. x_cond
    // selects between the K0 and PARTIAL_K0 widths inside the load macro.
7367*c217d954SCole Faust#if PARTIAL_M0 != 0
7368*c217d954SCole Faust    if(y_cond)
7369*c217d954SCole Faust    {
7370*c217d954SCole Faust        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
7371*c217d954SCole Faust    }
7372*c217d954SCole Faust    else
7373*c217d954SCole Faust#endif
7374*c217d954SCole Faust    {
7375*c217d954SCole Faust        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
7376*c217d954SCole Faust    }
7377*c217d954SCole Faust
7378*c217d954SCole Faust
    // Identity row indices for the indirect store.
7379*c217d954SCole Faust    TILE(uint, M0, 1, dst_indirect_y);
7380*c217d954SCole Faust    LOOP_UNROLLING(int, _i, 0, 1, M0,
7381*c217d954SCole Faust    {
7382*c217d954SCole Faust        dst_indirect_y[_i].v = _i;
7383*c217d954SCole Faust    });
7384*c217d954SCole Faust
    // Store all M0 rows at full width K0 (partial width 0, condition false).
    // The byte value OUTPUT_STEP_X * sizeof(DATA_TYPE) sits in the argument
    // slot where the load passes src_stride_y (presumably the row stride).
7385*c217d954SCole Faust    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
7386*c217d954SCole Faust#undef BLOCK_SIZE
7387*c217d954SCole Faust#undef OUTPUT_OFFSET_X
7388*c217d954SCole Faust#undef OUTPUT_STEP_X
7389*c217d954SCole Faust}
7390*c217d954SCole Faust#endif
7391*c217d954SCole Faust
7392*c217d954SCole Faust#if defined(RESHAPE_LHS_T)
7393*c217d954SCole Faust
/** Reshape one block of the LHS matrix and transpose it.
 *
 * Built only when RESHAPE_LHS_T is defined. The body relies on the
 * compile-time macros DATA_TYPE, M0, K0, PARTIAL_M0 and PARTIAL_K0, and
 * changes its output layout when INTERLEAVE is defined.
 *
 * Each invocation reads an M0 x K0 block of src into a zero-initialised tile,
 * transposes it into a K0 x M0 tile and writes that to dst as K0 rows of M0
 * elements.
 *
 * NOTE(review): TENSOR3D_T, GET_SPATIAL_IDX, TILE, LOOP_UNROLLING and the
 * T_LOAD/T_STORE_INDIRECT_WIDTH_SELECT macros are defined outside this
 * section. The src_* and dst_* names used below are presumably introduced by
 * TENSOR3D_T -- confirm.
 *
 * @param src Source tensor
 * @param dst Destination tensor
 * @param M   Row count used for the bottom-edge test and for the per-z source
 *            offset (z * M rows)
 * @param V0  Number of blocks, consecutive along y, that share one output row
 */
7394*c217d954SCole Faust__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER),
7395*c217d954SCole Faust                                        TENSOR3D_T(dst, BUFFER),
7396*c217d954SCole Faust                                        const int M,
7397*c217d954SCole Faust                                        const int V0)
7398*c217d954SCole Faust{
7399*c217d954SCole Faust
    // Number of elements in one M0 x K0 block.
7400*c217d954SCole Faust#define BLOCK_SIZE ((M0) * (K0))
7401*c217d954SCole Faust
7402*c217d954SCole Faust
    // Element offset between the starts of two neighbouring blocks that share
    // an output row: one transposed row (M0) when interleaving, a whole block
    // otherwise.
7403*c217d954SCole Faust#if defined(INTERLEAVE)
7404*c217d954SCole Faust#define OUTPUT_OFFSET_X (M0)
7405*c217d954SCole Faust#else
7406*c217d954SCole Faust#define OUTPUT_OFFSET_X (BLOCK_SIZE)
7407*c217d954SCole Faust#endif
7408*c217d954SCole Faust
7409*c217d954SCole Faust
    // Element distance between two consecutive rows of the same block in the
    // output. NOTE(review): the INTERLEAVE variant has no outer parentheses;
    // its only use below multiplies it, so precedence is unaffected here.
7410*c217d954SCole Faust#if defined(INTERLEAVE)
7411*c217d954SCole Faust#define OUTPUT_STEP_X (M0) * (V0)
7412*c217d954SCole Faust#else
7413*c217d954SCole Faust#define OUTPUT_STEP_X (M0)
7414*c217d954SCole Faust#endif
7415*c217d954SCole Faust
    // NOTE(review): presumably the work-item coordinates along dimensions 0, 1 and 2 -- confirm.
7416*c217d954SCole Faust    const int x = GET_SPATIAL_IDX(0, 1, 0);
7417*c217d954SCole Faust    const int y = GET_SPATIAL_IDX(1, 1, 0);
7418*c217d954SCole Faust    const int z = GET_SPATIAL_IDX(2, 1, 0);
7419*c217d954SCole Faust
    // Coordinates of the block's first element in the source.
7420*c217d954SCole Faust    const int xi = x * K0;
7421*c217d954SCole Faust    const int yi = y * M0;
7422*c217d954SCole Faust
    // Output position: V0 consecutive y values map to the same output row (yo)
    // and are placed OUTPUT_OFFSET_X elements apart inside it.
7423*c217d954SCole Faust    const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
7424*c217d954SCole Faust    const int yo = (y / V0);
7425*c217d954SCole Faust
7426*c217d954SCole Faust
    // The source offset for slice z is computed as z * M rows (not via src_stride_z).
7427*c217d954SCole Faust    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
7428*c217d954SCole Faust    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
7429*c217d954SCole Faust
    // in holds the block as loaded; in_tr receives its transpose.
7430*c217d954SCole Faust    TILE(DATA_TYPE, M0, K0, in);
7431*c217d954SCole Faust    TILE(DATA_TYPE, K0, M0, in_tr);
7432*c217d954SCole Faust
7433*c217d954SCole Faust
    // Clear the input tile before loading.
7434*c217d954SCole Faust    LOOP_UNROLLING(int, _i, 0, 1, M0,
7435*c217d954SCole Faust    {
7436*c217d954SCole Faust        in[_i].v = 0;
7437*c217d954SCole Faust    });
7438*c217d954SCole Faust
7439*c217d954SCole Faust
    // Edge flags: set when the block reaches or passes the right (src_w) or
    // bottom (M) boundary and a non-zero partial size is configured.
7440*c217d954SCole Faust    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
7441*c217d954SCole Faust    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
7442*c217d954SCole Faust
    // Identity row indices for the indirect load.
7443*c217d954SCole Faust    TILE(uint, M0, 1, in_indirect_y);
7444*c217d954SCole Faust    LOOP_UNROLLING(int, _i, 0, 1, M0,
7445*c217d954SCole Faust    {
7446*c217d954SCole Faust        in_indirect_y[_i].v = _i;
7447*c217d954SCole Faust
7448*c217d954SCole Faust    });
    // Load PARTIAL_M0 rows on the bottom edge, M0 rows otherwise. x_cond
    // selects between the K0 and PARTIAL_K0 widths inside the load macro.
7449*c217d954SCole Faust#if PARTIAL_M0 != 0
7450*c217d954SCole Faust    if(y_cond)
7451*c217d954SCole Faust    {
7452*c217d954SCole Faust        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
7453*c217d954SCole Faust    }
7454*c217d954SCole Faust    else
7455*c217d954SCole Faust#endif
7456*c217d954SCole Faust    {
7457*c217d954SCole Faust        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
7458*c217d954SCole Faust    }
7459*c217d954SCole Faust
    // Transpose element by element: in[m0][k0] -> in_tr[k0][m0].
7460*c217d954SCole Faust    LOOP_UNROLLING(int, m0, 0, 1, M0,
7461*c217d954SCole Faust    {
7462*c217d954SCole Faust        LOOP_UNROLLING(int, k0, 0, 1, K0,
7463*c217d954SCole Faust        {
7464*c217d954SCole Faust            in_tr[k0].s[m0] = in[m0].s[k0];
7465*c217d954SCole Faust        })
7466*c217d954SCole Faust    });
7467*c217d954SCole Faust
    // Identity row indices for the indirect store (K0 rows after the transpose).
7468*c217d954SCole Faust    TILE(uint, K0, 1, dst_indirect_y);
7469*c217d954SCole Faust    LOOP_UNROLLING(int, _i, 0, 1, K0,
7470*c217d954SCole Faust    {
7471*c217d954SCole Faust        dst_indirect_y[_i].v = _i;
7472*c217d954SCole Faust    });
7473*c217d954SCole Faust
7474*c217d954SCole Faust
    // Store all K0 rows at full width M0 (partial width 0, condition false).
    // The byte value OUTPUT_STEP_X * sizeof(DATA_TYPE) sits in the argument
    // slot where the load passes src_stride_y (presumably the row stride).
7475*c217d954SCole Faust    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
7476*c217d954SCole Faust
7477*c217d954SCole Faust#undef BLOCK_SIZE
7478*c217d954SCole Faust#undef OUTPUT_OFFSET_X
7479*c217d954SCole Faust#undef OUTPUT_STEP_X
7480*c217d954SCole Faust}
7481*c217d954SCole Faust#endif
7482*c217d954SCole Faust
7483*c217d954SCole Faust#if defined(RESHAPE_RHS_NT)
7484*c217d954SCole Faust
/** Reshape one block of the RHS matrix without transposing it.
 *
 * Built only when RESHAPE_RHS_NT is defined. The body relies on the
 * compile-time macros DATA_TYPE, K0 and N0, and changes its output layout when
 * INTERLEAVE is defined.
 *
 * Each invocation reads a K0 x N0 block of src into a zero-initialised tile
 * (rows at or beyond src_h are left at zero) and writes it to dst as K0 rows
 * of N0 elements.
 *
 * NOTE(review): TENSOR3D_T, GET_SPATIAL_IDX, TILE, V_LOAD and
 * T_STORE_INDIRECT_WIDTH_SELECT are defined outside this section. The src_*
 * and dst_* names used below are presumably introduced by TENSOR3D_T --
 * confirm. This kernel uses plain for loops where the LHS kernels use
 * LOOP_UNROLLING.
 *
 * @param src Source tensor
 * @param dst Destination tensor
 * @param H0  Number of blocks, consecutive along x, that share one output row
 */
7485*c217d954SCole Faust__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
7486*c217d954SCole Faust                                         TENSOR3D_T(dst, BUFFER),
7487*c217d954SCole Faust                                         const int H0)
7488*c217d954SCole Faust{
7489*c217d954SCole Faust
    // Number of elements in one K0 x N0 block.
7490*c217d954SCole Faust#define BLOCK_SIZE ((K0) * (N0))
7491*c217d954SCole Faust
7492*c217d954SCole Faust
    // Element offset between the starts of two neighbouring blocks that share
    // an output row: one row (N0) when interleaving, a whole block otherwise.
7493*c217d954SCole Faust#if defined(INTERLEAVE)
7494*c217d954SCole Faust#define OUTPUT_OFFSET_X (N0)
7495*c217d954SCole Faust#else
7496*c217d954SCole Faust#define OUTPUT_OFFSET_X (BLOCK_SIZE)
7497*c217d954SCole Faust#endif
7498*c217d954SCole Faust
7499*c217d954SCole Faust
    // Element distance between two consecutive rows of the same block in the
    // output. NOTE(review): the INTERLEAVE variant has no outer parentheses;
    // its only use below multiplies it, so precedence is unaffected here.
7500*c217d954SCole Faust#if defined(INTERLEAVE)
7501*c217d954SCole Faust#define OUTPUT_STEP_X (N0) * (H0)
7502*c217d954SCole Faust#else
7503*c217d954SCole Faust#define OUTPUT_STEP_X (N0)
7504*c217d954SCole Faust#endif
7505*c217d954SCole Faust
    // NOTE(review): presumably the work-item coordinates along dimensions 0, 1 and 2 -- confirm.
7506*c217d954SCole Faust    const int x = GET_SPATIAL_IDX(0, 1, 0);
7507*c217d954SCole Faust    const int y = GET_SPATIAL_IDX(1, 1, 0);
7508*c217d954SCole Faust    const int z = GET_SPATIAL_IDX(2, 1, 0);
7509*c217d954SCole Faust
    // Coordinates of the block's first element in the source.
7510*c217d954SCole Faust    const int xi = x * N0;
7511*c217d954SCole Faust    const int yi = y * K0;
7512*c217d954SCole Faust
    // Output position: H0 consecutive x values map to the same output row (yo)
    // and are placed OUTPUT_OFFSET_X elements apart inside it.
7513*c217d954SCole Faust    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
7514*c217d954SCole Faust    const int yo = (x / H0);
7515*c217d954SCole Faust
7516*c217d954SCole Faust    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
7517*c217d954SCole Faust    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
7518*c217d954SCole Faust
7519*c217d954SCole Faust    TILE(DATA_TYPE, K0, N0, in);
7520*c217d954SCole Faust
7521*c217d954SCole Faust
    // Clear the tile so that rows skipped by the load below remain zero.
7522*c217d954SCole Faust    for(int i = 0; i < K0; ++i)
7523*c217d954SCole Faust    {
7524*c217d954SCole Faust        in[i].v = 0;
7525*c217d954SCole Faust    }
7526*c217d954SCole Faust
7527*c217d954SCole Faust
    // Load only the rows that lie inside the source height.
    // NOTE(review): each load reads a full N0-wide vector; no check against
    // src_w is visible here -- confirm the source is padded along x.
7528*c217d954SCole Faust    for(int i = 0; i < K0; ++i)
7529*c217d954SCole Faust    {
7530*c217d954SCole Faust        if(yi + i < src_h)
7531*c217d954SCole Faust        {
7532*c217d954SCole Faust            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
7533*c217d954SCole Faust        }
7534*c217d954SCole Faust    }
7535*c217d954SCole Faust
    // Identity row indices for the indirect store.
7536*c217d954SCole Faust    TILE(uint, K0, 1, dst_indirect_y);
7537*c217d954SCole Faust    for(int i = 0; i < K0; ++i)
7538*c217d954SCole Faust    {
7539*c217d954SCole Faust        dst_indirect_y[i].v = i;
7540*c217d954SCole Faust    }
7541*c217d954SCole Faust
    // Store all K0 rows at full width N0 (partial width 0, condition false).
    // OUTPUT_STEP_X * sizeof(DATA_TYPE) is presumably the row stride in bytes.
7542*c217d954SCole Faust    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
7543*c217d954SCole Faust
7544*c217d954SCole Faust#undef BLOCK_SIZE
7545*c217d954SCole Faust#undef OUTPUT_OFFSET_X
7546*c217d954SCole Faust#undef OUTPUT_STEP_X
7547*c217d954SCole Faust}
7548*c217d954SCole Faust#endif
7549*c217d954SCole Faust
7550*c217d954SCole Faust#if defined(RESHAPE_RHS_T)
7551*c217d954SCole Faust
/** Reshape one block of the RHS matrix and transpose it.
 *
 * Built only when RESHAPE_RHS_T is defined. The body relies on the
 * compile-time macros DATA_TYPE, K0 and N0, and changes its output layout when
 * INTERLEAVE is defined.
 *
 * Each invocation reads a K0 x N0 block of src into a zero-initialised tile
 * (rows at or beyond src_h are left at zero), transposes it into an N0 x K0
 * tile and writes that to dst as N0 rows of K0 elements.
 *
 * NOTE(review): TENSOR3D_T, GET_SPATIAL_IDX, TILE, V_LOAD and
 * T_STORE_INDIRECT_WIDTH_SELECT are defined outside this section. The src_*
 * and dst_* names used below are presumably introduced by TENSOR3D_T --
 * confirm. This kernel uses plain for loops where the LHS kernels use
 * LOOP_UNROLLING.
 *
 * @param src Source tensor
 * @param dst Destination tensor
 * @param H0  Number of blocks, consecutive along x, that share one output row
 */
7552*c217d954SCole Faust__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
7553*c217d954SCole Faust                                        TENSOR3D_T(dst, BUFFER),
7554*c217d954SCole Faust                                        const int H0)
7555*c217d954SCole Faust{
7556*c217d954SCole Faust
    // Number of elements in one K0 x N0 block.
7557*c217d954SCole Faust#define BLOCK_SIZE ((K0) * (N0))
7558*c217d954SCole Faust
7559*c217d954SCole Faust
    // Element offset between the starts of two neighbouring blocks that share
    // an output row: one transposed row (K0) when interleaving, a whole block
    // otherwise.
7560*c217d954SCole Faust#if defined(INTERLEAVE)
7561*c217d954SCole Faust#define OUTPUT_OFFSET_X (K0)
7562*c217d954SCole Faust#else
7563*c217d954SCole Faust#define OUTPUT_OFFSET_X (BLOCK_SIZE)
7564*c217d954SCole Faust#endif
7565*c217d954SCole Faust
7566*c217d954SCole Faust
    // Element distance between two consecutive rows of the same block in the
    // output. NOTE(review): the INTERLEAVE variant has no outer parentheses;
    // its only use below multiplies it, so precedence is unaffected here.
7567*c217d954SCole Faust#if defined(INTERLEAVE)
7568*c217d954SCole Faust#define OUTPUT_STEP_X (K0) * (H0)
7569*c217d954SCole Faust#else
7570*c217d954SCole Faust#define OUTPUT_STEP_X (K0)
7571*c217d954SCole Faust#endif
7572*c217d954SCole Faust
    // NOTE(review): presumably the work-item coordinates along dimensions 0, 1 and 2 -- confirm.
7573*c217d954SCole Faust    const int x = GET_SPATIAL_IDX(0, 1, 0);
7574*c217d954SCole Faust    const int y = GET_SPATIAL_IDX(1, 1, 0);
7575*c217d954SCole Faust    const int z = GET_SPATIAL_IDX(2, 1, 0);
7576*c217d954SCole Faust
    // Coordinates of the block's first element in the source.
7577*c217d954SCole Faust    const int xi = x * N0;
7578*c217d954SCole Faust    const int yi = y * K0;
7579*c217d954SCole Faust
    // Output position: H0 consecutive x values map to the same output row (yo)
    // and are placed OUTPUT_OFFSET_X elements apart inside it.
7580*c217d954SCole Faust    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
7581*c217d954SCole Faust    const int yo = (x / H0);
7582*c217d954SCole Faust
7583*c217d954SCole Faust    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
7584*c217d954SCole Faust    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
7585*c217d954SCole Faust
    // in holds the block as loaded; in_tr receives its transpose.
7586*c217d954SCole Faust    TILE(DATA_TYPE, K0, N0, in);
7587*c217d954SCole Faust    TILE(DATA_TYPE, N0, K0, in_tr);
7588*c217d954SCole Faust
7589*c217d954SCole Faust
    // Clear the input tile so that rows skipped by the load below remain zero.
7590*c217d954SCole Faust    for(int i = 0; i < K0; ++i)
7591*c217d954SCole Faust    {
7592*c217d954SCole Faust        in[i].v = 0;
7593*c217d954SCole Faust    }
7594*c217d954SCole Faust
7595*c217d954SCole Faust
    // Load only the rows that lie inside the source height.
    // NOTE(review): each load reads a full N0-wide vector; no check against
    // src_w is visible here -- confirm the source is padded along x.
7596*c217d954SCole Faust    for(int i = 0; i < K0; ++i)
7597*c217d954SCole Faust    {
7598*c217d954SCole Faust        if(yi + i < src_h)
7599*c217d954SCole Faust        {
7600*c217d954SCole Faust            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
7601*c217d954SCole Faust        }
7602*c217d954SCole Faust    }
7603*c217d954SCole Faust
7604*c217d954SCole Faust
    // Transpose element by element: in[k0][n0] -> in_tr[n0][k0].
7605*c217d954SCole Faust    for(int k0 = 0; k0 < K0; ++k0)
7606*c217d954SCole Faust    {
7607*c217d954SCole Faust        for(int n0 = 0; n0 < N0; ++n0)
7608*c217d954SCole Faust        {
7609*c217d954SCole Faust            in_tr[n0].s[k0] = in[k0].s[n0];
7610*c217d954SCole Faust        }
7611*c217d954SCole Faust    }
7612*c217d954SCole Faust
    // Identity row indices for the indirect store (N0 rows after the transpose).
7613*c217d954SCole Faust    TILE(uint, N0, 1, dst_indirect_y);
7614*c217d954SCole Faust    for(int i = 0; i < N0; ++i)
7615*c217d954SCole Faust    {
7616*c217d954SCole Faust        dst_indirect_y[i].v = i;
7617*c217d954SCole Faust    }
7618*c217d954SCole Faust
    // Store all N0 rows at full width K0 (partial width 0, condition false).
    // OUTPUT_STEP_X * sizeof(DATA_TYPE) is presumably the row stride in bytes.
7619*c217d954SCole Faust    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
7620*c217d954SCole Faust
7621*c217d954SCole Faust#undef BLOCK_SIZE
7622*c217d954SCole Faust#undef OUTPUT_OFFSET_X
7623*c217d954SCole Faust#undef OUTPUT_STEP_X
7624*c217d954SCole Faust}
7625*c217d954SCole Faust
7626*c217d954SCole Faust#endif  )"