xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/common/gemm.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1*c217d954SCole FaustR"(
2*c217d954SCole Faust
3*c217d954SCole Faust
4*c217d954SCole Faust
5*c217d954SCole Faust
6*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
7*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
8*c217d954SCole Faust
9*c217d954SCole Faust
10*c217d954SCole Faust
11*c217d954SCole Faust
/** Store the rows of a block kept in the variables BASENAME##0 .. BASENAME##F.
 *
 * STORE_ROW_n expands to n VSTORE(N0) statements, emitted in ascending row order
 * because every macro first expands its predecessor. Row i (suffix is a single hex
 * digit, so rows 10..15 use A..F) is written with offset 0 to
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i)
 * The pointer arithmetic happens before the cast, so STRIDE_Y and Z##i are in units
 * of whatever PTR points to (presumably bytes - confirm against the callers).
 * VSTORE is not defined in this part of the file.
 *
 * N0        : vector width forwarded to VSTORE
 * DATA_TYPE : element type of the destination pointer
 * BASENAME  : prefix of the row variables
 * PTR       : base destination address
 * STRIDE_Y  : distance between two consecutive rows
 * Z         : prefix of the per-row extra offset variables
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
90*c217d954SCole Faust
91*c217d954SCole Faust
92*c217d954SCole Faust
/** Convert-and-store variants of the row store macros, rows 0..8.
 *
 * CONVERT_STORE_ROW_n expands to n VSTORE(N0) statements in ascending row order.
 * Row i first passes BASENAME##i through
 *     CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0))
 * and writes the result with offset 0 to
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i)
 * VSTORE, CONVERT_SAT and VEC_DATA_TYPE are not defined in this part of the file;
 * judging by the name CONVERT_SAT is a saturating conversion (confirm at its definition).
 *
 * N0        : vector width
 * DATA_TYPE : destination element type
 * BASENAME  : prefix of the row variables
 * PTR       : base destination address
 * STRIDE_Y  : distance between two consecutive rows
 * Z         : prefix of the per-row extra offset variables
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
136*c217d954SCole Faust
/** Convert and store rows 0..9 of a block kept in BASENAME##0 .. BASENAME##9.
 *
 * Expands CONVERT_STORE_ROW_9 for rows 0..8 and then stores row 9: BASENAME##9 is
 * passed through CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) and written with
 * VSTORE(N0), offset 0, to (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9).
 *
 * The second parameter must be spelled DATA_TYPE: the replacement list uses that
 * name, and with any other parameter name the caller's type would be ignored and
 * an unrelated global DATA_TYPE definition (if any) would be used instead.
 *
 * N0        : vector width
 * DATA_TYPE : destination element type
 * BASENAME  : prefix of the row variables
 * PTR       : base destination address
 * STRIDE_Y  : distance between two consecutive rows
 * Z         : prefix of the per-row extra offset variables
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
141*c217d954SCole Faust
/** Convert-and-store variants of the row store macros, rows 10..15.
 *
 * Each macro expands its predecessor and then adds one VSTORE(N0) statement. The
 * row suffix is a single hex digit, so rows 10..15 read BASENAME##A .. BASENAME##F
 * and Z##A .. Z##F. Row i stores
 *     CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0))
 * with offset 0 to (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i).
 * The parameters have the same meaning as in the lower-numbered CONVERT_STORE_ROW macros.
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
171*c217d954SCole Faust
172*c217d954SCole Faust
173*c217d954SCole Faust
174*c217d954SCole Faust
/** Store a block of M0 rows by dispatching to STORE_ROW_<M0>.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is macro-expanded before it is
 * pasted onto STORE_ROW_; calling STORE_BLOCK_STR directly pastes M0 as spelled.
 * All remaining arguments are handed to the selected STORE_ROW_n macro unchanged.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
177*c217d954SCole Faust
178*c217d954SCole Faust
179*c217d954SCole Faust
/** Convert and store a block of M0 rows by dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * CONVERT_STORE_BLOCK forwards to CONVERT_STORE_BLOCK_STR so that M0 is
 * macro-expanded before it is pasted onto CONVERT_STORE_ROW_. The remaining
 * arguments are handed to the selected row macro unchanged.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
182*c217d954SCole Faust
183*c217d954SCole Faust
184*c217d954SCole Faust
/** Partial-width row stores for a block kept in BASENAME##0 .. BASENAME##F.
 *
 * STORE_ROW_PARTIAL_n expands to n VSTORE_PARTIAL(N0, STORE_N0) statements in
 * ascending row order (each macro first expands its predecessor). Row i (single hex
 * digit suffix, rows 10..15 use A..F) is written with offset 0 to
 *     (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i)
 * VSTORE_PARTIAL is not defined in this part of the file; from its arguments it
 * presumably stores only STORE_N0 of the N0 lanes - confirm at its definition.
 *
 * N0        : width of the row vectors
 * STORE_N0  : second argument forwarded to VSTORE_PARTIAL
 * DATA_TYPE : element type of the destination pointer
 * BASENAME  : prefix of the row variables
 * PTR       : base destination address
 * STRIDE_Y  : distance between two consecutive rows
 * Z         : prefix of the per-row extra offset variables
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
263*c217d954SCole Faust
264*c217d954SCole Faust
265*c217d954SCole Faust
/** Store STORE_M0 rows with partial width by dispatching to STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * STORE_BLOCK_PARTIAL forwards to STORE_BLOCK_PARTIAL_STR so that STORE_M0 is
 * macro-expanded before it is pasted onto STORE_ROW_PARTIAL_. Note the argument
 * order changes on the way down: the row macro receives (N0, STORE_N0, ...) while
 * this macro takes (STORE_M0, STORE_N0, N0, ...).
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
268*c217d954SCole Faust
/** Store a block that may be partial in both dimensions, chosen at run time.
 *
 * Expands to one if / else-if chain (no trailing semicolon) with four branches:
 *   neither condition set : STORE_BLOCK_PARTIAL(M0,               N0,               ...)
 *   only PARTIAL_COND_Y   : STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0,               ...)
 *   only PARTIAL_COND_X   : STORE_BLOCK_PARTIAL(M0,               PARTIAL_STORE_N0, ...)
 *   both conditions set   : STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, ...)
 * The conditions can be evaluated more than once while walking the chain.
 *
 * M0, N0            : full block height and width
 * PARTIAL_STORE_M0  : row count used when PARTIAL_COND_Y holds
 * PARTIAL_STORE_N0  : column count used when PARTIAL_COND_X holds
 * PARTIAL_COND_Y/X  : run-time conditions selecting the partial variants
 * The remaining arguments are forwarded to STORE_BLOCK_PARTIAL.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!((PARTIAL_COND_X) || (PARTIAL_COND_Y))) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
286*c217d954SCole Faust
/** Store a block that may be partial in the X dimension only, chosen at run time.
 *
 * Expands to an if / else statement (no trailing semicolon): when PARTIAL_COND_X
 * holds, PARTIAL_STORE_N0 columns are stored, otherwise the full N0 columns. All
 * M0 rows are stored in both cases. The remaining arguments are forwarded to
 * STORE_BLOCK_PARTIAL.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
296*c217d954SCole Faust
/** Store a block that may be partial in the Y dimension only, chosen at run time.
 *
 * Expands to an if / else statement (no trailing semicolon): when PARTIAL_COND_Y
 * holds, PARTIAL_STORE_M0 rows are stored, otherwise the full M0 rows. All N0
 * columns are stored in both cases. The remaining arguments are forwarded to
 * STORE_BLOCK_PARTIAL.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
306*c217d954SCole Faust
307*c217d954SCole Faust
/** Boundary-aware block store, specialised at preprocessing time.
 *
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined as
 * build-time macros. Their values pick one of four implementations:
 *   M0 == 0 and N0 == 0 : plain STORE_BLOCK, the partial arguments are ignored
 *   M0 >  0 and N0 == 0 : STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 and N0 >  0 : STORE_BLOCK_PARTIAL_IN_X
 *   anything else       : STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * The macro parameters named PARTIAL_STORE_M0 / PARTIAL_STORE_N0 shadow the
 * build-time macros inside the replacement list, so the values used in the
 * expansion are the ones supplied by the caller.
 */
#if defined(PARTIAL_STORE_N0) && defined(PARTIAL_STORE_M0)

#if PARTIAL_STORE_N0 == 0 && PARTIAL_STORE_M0 == 0

/* No partial blocks in either dimension. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_N0 == 0 && PARTIAL_STORE_M0 > 0

/* Partial blocks in Y only. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_N0 > 0 && PARTIAL_STORE_M0 == 0

/* Partial blocks in X only. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

/* Partial blocks possible in both X and Y. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif /* PARTIAL_STORE_M0 / PARTIAL_STORE_N0 value selection */

#endif /* defined(PARTIAL_STORE_N0) && defined(PARTIAL_STORE_M0) */
334*c217d954SCole Faust
335*c217d954SCole Faust
/** Compute the first row handled by the work-item with index y along the row dimension.
 *
 * When the build defines PARTIAL_STORE_M0 the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * i.e. the nominal start y * M0 moved up by (M0 - PARTIAL_STORE_M0) rows and clamped
 * at 0; the modulo makes the shift 0 when PARTIAL_STORE_M0 is 0. Without that build
 * macro the result is simply y * M0 and the third argument is ignored.
 * The result is cast to uint in both variants.
 *
 * Every argument is parenthesized in the replacement list so that expression
 * arguments (for example a sum passed as y) keep their intended grouping.
 *
 * y                : work-item index along the row dimension
 * M0               : number of rows per work-item
 * PARTIAL_STORE_M0 : number of rows in the partial block (shadows the build macro here)
 */
#if defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
344*c217d954SCole Faust
345*c217d954SCole Faust
346*c217d954SCole Faust
/** Store a single vector, either in full or only its leftover part.
 *
 * Thin wrapper over STORE_BLOCK_PARTIAL_IN_X with one row (M0 = 1), STRIDE_Y = 0
 * and Z = 0. Because the row macros paste the row index onto Z, the literal 0
 * becomes the token 00 there, which is still the value zero.
 *
 * BASENAME  : prefix of the vector variable (the row macro reads BASENAME##0)
 * DATA_TYPE : element type of the destination pointer
 * PTR       : destination address
 * VEC_SIZE  : full vector width
 * LEFTOVER  : width stored when COND holds
 * COND      : run-time condition selecting the leftover store
 */
#define STORE_VECTOR_SELECT(BASENAME, DATA_TYPE, PTR, VEC_SIZE, LEFTOVER, COND) STORE_BLOCK_PARTIAL_IN_X(1, VEC_SIZE, DATA_TYPE, BASENAME, PTR, 0, 0, LEFTOVER, COND)
349*c217d954SCole Faust
350*c217d954SCole Faust
/* Enable optional OpenCL extensions. Each pragma is emitted only when the
 * matching ARM_COMPUTE_* switch is defined AND the extension's own feature macro
 * is defined. */
#if defined(cl_khr_fp16) && defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif /* cl_khr_fp16 */

#if defined(cl_arm_integer_dot_product_int8) && defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif /* cl_arm_integer_dot_product_int8 */

#if defined(cl_arm_integer_dot_product_accumulate_int8) && defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif /* cl_arm_integer_dot_product_accumulate_int8 */

#if defined(cl_arm_printf) && defined(ARM_COMPUTE_DEBUG_ENABLED)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif /* cl_arm_printf */
366*c217d954SCole Faust
/* Numeric identifiers for the GPU architecture families named below. */
#define GPU_ARCH_MIDGARD    0x100
#define GPU_ARCH_BIFROST    0x200
#define GPU_ARCH_VALHALL    0x300
370*c217d954SCole Faust
371*c217d954SCole Faust
/* Paste two tokens into one. The arguments are pasted exactly as spelled; they are
 * not macro-expanded first. */
#define CONCAT(lhs, rhs) lhs##rhs
373*c217d954SCole Faust
374*c217d954SCole Faust
/* Identity macro: yields its argument after one more round of macro expansion. */
#define EXPAND(token) token
376*c217d954SCole Faust
377*c217d954SCole Faust
/* Limit value to the range [lower, upper]: max() is applied with lower first, then
 * min() with upper, so upper wins if the bounds are crossed. */
#define CLAMP(value, lower, upper) min(max(value, lower), upper)
379*c217d954SCole Faust
380*c217d954SCole Faust
/* REVn(vec): reverse the component order of an n-component vector using a numeric
 * swizzle. REV1 is the identity. */
#define REV1(vec)  ((vec))
#define REV2(vec)  ((vec).s10)
#define REV3(vec)  ((vec).s210)
#define REV4(vec)  ((vec).s3210)
#define REV8(vec)  ((vec).s76543210)
#define REV16(vec) ((vec).sFEDCBA9876543210)
387*c217d954SCole Faust
388*c217d954SCole Faust
389*c217d954SCole Faust
/* REVERSE(vec, size): reverse a vector of the given size by dispatching to
 * REV<size>. The extra level through REVERSE_STR lets size be macro-expanded before
 * it is pasted onto REV. */
#define REVERSE_STR(vec, size) REV##size((vec))
#define REVERSE(vec, size) REVERSE_STR(vec, size)
392*c217d954SCole Faust
393*c217d954SCole Faust
394*c217d954SCole Faust
/* ROT1_k(vec): rotation of a single-component value - always the identity. */
#define ROT1_0(vec) ((vec))
#define ROT1_1(vec) ((vec))
397*c217d954SCole Faust
398*c217d954SCole Faust#define ROT2_0(x) ((x))
399*c217d954SCole Faust#define ROT2_1(x) ((x).s10)
400*c217d954SCole Faust#define ROT2_2(x) ((x))
401*c217d954SCole Faust
402*c217d954SCole Faust#define ROT3_0(x) ((x))
403*c217d954SCole Faust#define ROT3_1(x) ((x).s201)
404*c217d954SCole Faust#define ROT3_2(x) ((x).s120)
405*c217d954SCole Faust#define ROT3_3(x) ((x))
406*c217d954SCole Faust
407*c217d954SCole Faust#define ROT4_0(x) ((x))
408*c217d954SCole Faust#define ROT4_1(x) ((x).s3012)
409*c217d954SCole Faust#define ROT4_2(x) ((x).s2301)
410*c217d954SCole Faust#define ROT4_3(x) ((x).s1230)
411*c217d954SCole Faust#define ROT4_4(x) ((x))
412*c217d954SCole Faust
413*c217d954SCole Faust#define ROT8_0(x) ((x))
414*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456)
415*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345)
416*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234)
417*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123)
418*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012)
419*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701)
420*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670)
421*c217d954SCole Faust#define ROT8_8(x) ((x))
422*c217d954SCole Faust
423*c217d954SCole Faust#define ROT16_0(x) ((x))
424*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
425*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
426*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
427*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
428*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
429*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
430*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
431*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
432*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
433*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
434*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
435*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
436*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
437*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
438*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
439*c217d954SCole Faust#define ROT16_16(x) ((x))
440*c217d954SCole Faust
441*c217d954SCole Faust
442*c217d954SCole Faust
443*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
444*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
445*c217d954SCole Faust
446*c217d954SCole Faust
447*c217d954SCole Faust
/*
 * V_OFFS<s>(dt): vector literal of type dt<s> holding the component indices
 * 0, 1, ..., s - 1.
 */
#define V_OFFS1(type)  (type##1)(0)
#define V_OFFS2(type)  (type##2)(0, 1)
#define V_OFFS3(type)  (type##3)(0, 1, 2)
#define V_OFFS4(type)  (type##4)(0, 1, 2, 3)
#define V_OFFS8(type)  (type##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(type) (type##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/*
 * VEC_OFFS(dt, s): index vector of s components of element type dt.
 * Arguments are expanded before VEC_OFFS_STR pastes s onto V_OFFS.
 */
#define VEC_OFFS_STR(type, size) V_OFFS##size(type)
#define VEC_OFFS(type, size) VEC_OFFS_STR(type, size)
459*c217d954SCole Faust
460*c217d954SCole Faust
/*
 * VLOAD(size): name of the vload<size> function. size is expanded before
 * VLOAD_STR pastes it, so it may be a macro.
 */
#define VLOAD_STR(n) vload##n
#define VLOAD(n) VLOAD_STR(n)
463*c217d954SCole Faust
464*c217d954SCole Faust
/*
 * VLOAD_PARTIAL(size, load_size): name of the vload_partial_<size>_<load_size>
 * macro. Both arguments are expanded before being pasted.
 */
#define VLOAD_PARTIAL_STR(vec_size, count) vload_partial_##vec_size##_##count
#define VLOAD_PARTIAL(vec_size, count) VLOAD_PARTIAL_STR(vec_size, count)

/* NO_LOAD(data, offs, ptr): ignores its arguments and expands to an empty block. */
#define NO_LOAD(data, offs, ptr) { }
471*c217d954SCole Faust
472*c217d954SCole Faust
/*
 * Dispatch table vload_partial_<size>_<n>: how to load n elements into a
 * vector of <size> components.
 *   - n == 0 or n > size  -> NO_LOAD (nothing is loaded)
 *   - 1 <= n <= size      -> vload_partial_<n> (vload1 for the size-1 case)
 */

/* size 1 */
#define vload_partial_1_0   NO_LOAD
#define vload_partial_1_1   vload1
#define vload_partial_1_2   NO_LOAD
#define vload_partial_1_3   NO_LOAD
#define vload_partial_1_4   NO_LOAD
#define vload_partial_1_5   NO_LOAD
#define vload_partial_1_6   NO_LOAD
#define vload_partial_1_7   NO_LOAD
#define vload_partial_1_8   NO_LOAD
#define vload_partial_1_9   NO_LOAD
#define vload_partial_1_10  NO_LOAD
#define vload_partial_1_11  NO_LOAD
#define vload_partial_1_12  NO_LOAD
#define vload_partial_1_13  NO_LOAD
#define vload_partial_1_14  NO_LOAD
#define vload_partial_1_15  NO_LOAD
#define vload_partial_1_16  NO_LOAD

/* size 2 */
#define vload_partial_2_0   NO_LOAD
#define vload_partial_2_1   vload_partial_1
#define vload_partial_2_2   vload_partial_2
#define vload_partial_2_3   NO_LOAD
#define vload_partial_2_4   NO_LOAD
#define vload_partial_2_5   NO_LOAD
#define vload_partial_2_6   NO_LOAD
#define vload_partial_2_7   NO_LOAD
#define vload_partial_2_8   NO_LOAD
#define vload_partial_2_9   NO_LOAD
#define vload_partial_2_10  NO_LOAD
#define vload_partial_2_11  NO_LOAD
#define vload_partial_2_12  NO_LOAD
#define vload_partial_2_13  NO_LOAD
#define vload_partial_2_14  NO_LOAD
#define vload_partial_2_15  NO_LOAD
#define vload_partial_2_16  NO_LOAD

/* size 3 */
#define vload_partial_3_0   NO_LOAD
#define vload_partial_3_1   vload_partial_1
#define vload_partial_3_2   vload_partial_2
#define vload_partial_3_3   vload_partial_3
#define vload_partial_3_4   NO_LOAD
#define vload_partial_3_5   NO_LOAD
#define vload_partial_3_6   NO_LOAD
#define vload_partial_3_7   NO_LOAD
#define vload_partial_3_8   NO_LOAD
#define vload_partial_3_9   NO_LOAD
#define vload_partial_3_10  NO_LOAD
#define vload_partial_3_11  NO_LOAD
#define vload_partial_3_12  NO_LOAD
#define vload_partial_3_13  NO_LOAD
#define vload_partial_3_14  NO_LOAD
#define vload_partial_3_15  NO_LOAD
#define vload_partial_3_16  NO_LOAD

/* size 4 */
#define vload_partial_4_0   NO_LOAD
#define vload_partial_4_1   vload_partial_1
#define vload_partial_4_2   vload_partial_2
#define vload_partial_4_3   vload_partial_3
#define vload_partial_4_4   vload_partial_4
#define vload_partial_4_5   NO_LOAD
#define vload_partial_4_6   NO_LOAD
#define vload_partial_4_7   NO_LOAD
#define vload_partial_4_8   NO_LOAD
#define vload_partial_4_9   NO_LOAD
#define vload_partial_4_10  NO_LOAD
#define vload_partial_4_11  NO_LOAD
#define vload_partial_4_12  NO_LOAD
#define vload_partial_4_13  NO_LOAD
#define vload_partial_4_14  NO_LOAD
#define vload_partial_4_15  NO_LOAD
#define vload_partial_4_16  NO_LOAD

/* size 8 */
#define vload_partial_8_0   NO_LOAD
#define vload_partial_8_1   vload_partial_1
#define vload_partial_8_2   vload_partial_2
#define vload_partial_8_3   vload_partial_3
#define vload_partial_8_4   vload_partial_4
#define vload_partial_8_5   vload_partial_5
#define vload_partial_8_6   vload_partial_6
#define vload_partial_8_7   vload_partial_7
#define vload_partial_8_8   vload_partial_8
#define vload_partial_8_9   NO_LOAD
#define vload_partial_8_10  NO_LOAD
#define vload_partial_8_11  NO_LOAD
#define vload_partial_8_12  NO_LOAD
#define vload_partial_8_13  NO_LOAD
#define vload_partial_8_14  NO_LOAD
#define vload_partial_8_15  NO_LOAD
#define vload_partial_8_16  NO_LOAD

/* size 16 */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
580*c217d954SCole Faust
581*c217d954SCole Faust
/*
 * vload_partial_<n>(DATA, OFFSET, PTR): assign the first n components of DATA
 * from memory at PTR, leaving the remaining components of DATA untouched.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 are a single vload<n>. The other sizes are built
 * from a 4- or 8-wide head followed by a tail loaded from PTR + 4 or PTR + 8.
 *
 * Every expansion ends with a semicolon, and the composite sizes expand to
 * more than one statement, so these macros must not be used as the unbraced
 * body of an if / else / loop.
 *
 * NOTE(review): the tail piece reuses OFFSET unchanged on a shifted pointer;
 * this looks correct only for OFFSET == 0 -- confirm against callers.
 */
#define vload_partial_1(vec, offset, ptr) \
    vec.s0 = vload1(offset, ptr);

#define vload_partial_2(vec, offset, ptr) \
    vec.s01 = vload2(offset, ptr);

#define vload_partial_3(vec, offset, ptr) \
    vec.s012 = vload3(offset, ptr);

#define vload_partial_4(vec, offset, ptr) \
    vec.s0123 = vload4(offset, ptr);

/* 5 = 4 + 1 */
#define vload_partial_5(vec, offset, ptr)    \
    vload_partial_4(vec.s0123, offset, ptr); \
    vec.s4 = vload1(offset, ptr + 4);

/* 6 = 4 + 2 */
#define vload_partial_6(vec, offset, ptr)    \
    vload_partial_4(vec.s0123, offset, ptr); \
    vload_partial_2(vec.s45, offset, ptr + 4);

/* 7 = 4 + 3 */
#define vload_partial_7(vec, offset, ptr)    \
    vload_partial_4(vec.s0123, offset, ptr); \
    vload_partial_3(vec.s456, offset, ptr + 4);

#define vload_partial_8(vec, offset, ptr) \
    vec.s01234567 = vload8(offset, ptr);

/* 9 = 8 + 1 */
#define vload_partial_9(vec, offset, ptr)        \
    vload_partial_8(vec.s01234567, offset, ptr); \
    vec.s8 = vload1(offset, ptr + 8);

/* 10 = 8 + 2 */
#define vload_partial_10(vec, offset, ptr)       \
    vload_partial_8(vec.s01234567, offset, ptr); \
    vload_partial_2(vec.s89, offset, ptr + 8);

/* 11 = 8 + 3 */
#define vload_partial_11(vec, offset, ptr)       \
    vload_partial_8(vec.s01234567, offset, ptr); \
    vload_partial_3(vec.s89A, offset, ptr + 8);

/* 12 = 8 + 4 */
#define vload_partial_12(vec, offset, ptr)       \
    vload_partial_8(vec.s01234567, offset, ptr); \
    vload_partial_4(vec.s89AB, offset, ptr + 8);

/* 13 = 8 + 5: the whole upper half is handed to the 5-wide helper */
#define vload_partial_13(vec, offset, ptr)       \
    vload_partial_8(vec.s01234567, offset, ptr); \
    vload_partial_5(vec.s89ABCDEF, offset, ptr + 8);

/* 14 = 8 + 6 */
#define vload_partial_14(vec, offset, ptr)       \
    vload_partial_8(vec.s01234567, offset, ptr); \
    vload_partial_6(vec.s89ABCDEF, offset, ptr + 8);

/* 15 = 8 + 7 */
#define vload_partial_15(vec, offset, ptr)       \
    vload_partial_8(vec.s01234567, offset, ptr); \
    vload_partial_7(vec.s89ABCDEF, offset, ptr + 8);

#define vload_partial_16(vec, offset, ptr) \
    vec = vload16(offset, ptr);
639*c217d954SCole Faust
640*c217d954SCole Faust
641*c217d954SCole Faust
/*
 * PIXEL_UNIT<n>: n / 4 for vector sizes 4, 8 and 16 -- presumably the number
 * of 4-component texels needed to hold a vector of n elements.
 */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/*
 * CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): map a vector size (4, 8 or 16)
 * to its PIXEL_UNIT value. vec_size is expanded before being pasted.
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(n) PIXEL_UNIT##n
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(n) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(n)
649*c217d954SCole Faust
650*c217d954SCole Faust
/*
 * read_image2d_<type>x<k>(img, x_coord, y_coord): read k texels at
 * (x_coord, y_coord) ... (x_coord + k - 1, y_coord) and concatenate them into
 * one vector of 4 * k components.
 *
 * Each expansion already ends with a semicolon, so these macros can only be
 * used where a full statement tail is acceptable, not inside a larger
 * expression.
 */
#define read_image2d_floatx1(image, x, y) (float4)(read_imagef(image, (int2)(x, y)));
#define read_image2d_floatx2(image, x, y) (float8)(read_imagef(image, (int2)(x, y)), read_imagef(image, (int2)(x + 1, y)));
#define read_image2d_floatx4(image, x, y) (float16)(read_imagef(image, (int2)(x, y)), read_imagef(image, (int2)(x + 1, y)), read_imagef(image, (int2)(x + 2, y)), read_imagef(image, (int2)(x + 3, y)));

/* Half-precision variants, available only when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(image, x, y) (half4)(read_imageh(image, (int2)(x, y)));
#define read_image2d_halfx2(image, x, y) (half8)(read_imageh(image, (int2)(x, y)), read_imageh(image, (int2)(x + 1, y)));
#define read_image2d_halfx4(image, x, y) (half16)(read_imageh(image, (int2)(x, y)), read_imageh(image, (int2)(x + 1, y)), read_imageh(image, (int2)(x + 2, y)), read_imageh(image, (int2)(x + 3, y)));
#endif /* ARM_COMPUTE_OPENCL_FP16_ENABLED && cl_khr_fp16 */
660*c217d954SCole Faust
/*
 * write_image2d_<type>x<k>(img, x_coord, y_coord, values): write k texels at
 * (x_coord, y_coord) ... (x_coord + k - 1, y_coord), taking 4 consecutive
 * components of values for each texel. The individual writes are chained
 * with the comma operator inside one parenthesised expression, and each
 * expansion already ends with a semicolon.
 */
#define write_image2d_floatx1(image, x, y, data) (write_imagef(image, (int2)(x, y), data));
#define write_image2d_floatx2(image, x, y, data) (write_imagef(image, (int2)(x, y), data.s0123), write_imagef(image, (int2)(x + 1, y), data.s4567));
#define write_image2d_floatx4(image, x, y, data) (write_imagef(image, (int2)(x, y), data.s0123), write_imagef(image, (int2)(x + 1, y), data.s4567), write_imagef(image, (int2)(x + 2, y), data.s89AB), write_imagef(image, (int2)(x + 3, y), data.sCDEF));

/* Half-precision variants, available only when fp16 support is enabled. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(image, x, y, data) (write_imageh(image, (int2)(x, y), data));
#define write_image2d_halfx2(image, x, y, data) (write_imageh(image, (int2)(x, y), data.s0123), write_imageh(image, (int2)(x + 1, y), data.s4567));
#define write_image2d_halfx4(image, x, y, data) (write_imageh(image, (int2)(x, y), data.s0123), write_imageh(image, (int2)(x + 1, y), data.s4567), write_imageh(image, (int2)(x + 2, y), data.s89AB), write_imageh(image, (int2)(x + 3, y), data.sCDEF));
#endif /* ARM_COMPUTE_OPENCL_FP16_ENABLED && cl_khr_fp16 */
670*c217d954SCole Faust
671*c217d954SCole Faust
/*
 * READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatch to
 * read_image2d_<data_type>x<n0>. data_type and n0 are expanded before
 * READ_IMAGE2D_STR pastes them into the name.
 */
#define READ_IMAGE2D_STR(type, count, image, x, y) read_image2d_##type##x##count(image, x, y)
#define READ_IMAGE2D(type, count, image, x, y) READ_IMAGE2D_STR(type, count, image, x, y)

/*
 * WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatch to
 * write_image2d_<data_type>x<n0>, with the same two-level expansion.
 */
#define WRITE_IMAGE2D_STR(type, count, image, x, y, data) write_image2d_##type##x##count(image, x, y, data)
#define WRITE_IMAGE2D(type, count, image, x, y, data) WRITE_IMAGE2D_STR(type, count, image, x, y, data)
678*c217d954SCole Faust
/*
 * VSTORE(size): name of the vstore<size> function. size is expanded before
 * VSTORE_STR pastes it, so it may be a macro.
 */
#define VSTORE_STR(n) vstore##n
#define VSTORE(n) VSTORE_STR(n)
681*c217d954SCole Faust
/*
 * One-component "vector" type names. They let macros that paste a size onto a
 * type name (type##size) work when the size is 1 by resolving <type>1 to the
 * scalar type.
 */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double
693*c217d954SCole Faust
/*
 * Scalar counterparts of vloadN / vstoreN for the one-component case.
 *
 * vload1(OFFSET, PTR)        : value stored at PTR + OFFSET.
 * vstore1(DATA, OFFSET, PTR) : store DATA at PTR + OFFSET.
 *
 * Every argument and the whole expansion are parenthesised so that arguments
 * containing low-precedence operators (for example a conditional expression
 * used as OFFSET) and uses inside larger expressions are parsed as intended.
 */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
696*c217d954SCole Faust
697*c217d954SCole Faust
/*
 * VSTORE_PARTIAL(size, store_size): name of the
 * vstore_partial_<size>_<store_size> macro. Both arguments are expanded
 * before being pasted.
 */
#define VSTORE_PARTIAL_STR(vec_size, count) vstore_partial_##vec_size##_##count
#define VSTORE_PARTIAL(vec_size, count) VSTORE_PARTIAL_STR(vec_size, count)

/* NO_STORE(data, offs, ptr): ignores its arguments and expands to an empty block. */
#define NO_STORE(data, offs, ptr) { }
704*c217d954SCole Faust
705*c217d954SCole Faust
/*
 * Dispatch table vstore_partial_<size>_<n>: how to store the first n elements
 * of a vector of <size> components.
 *   - n == 0 or n > size  -> NO_STORE (nothing is stored)
 *   - 1 <= n <= size      -> vstore_partial_<n> (vstore1 for the size-1 case)
 */

/* size 1 */
#define vstore_partial_1_0   NO_STORE
#define vstore_partial_1_1   vstore1
#define vstore_partial_1_2   NO_STORE
#define vstore_partial_1_3   NO_STORE
#define vstore_partial_1_4   NO_STORE
#define vstore_partial_1_5   NO_STORE
#define vstore_partial_1_6   NO_STORE
#define vstore_partial_1_7   NO_STORE
#define vstore_partial_1_8   NO_STORE
#define vstore_partial_1_9   NO_STORE
#define vstore_partial_1_10  NO_STORE
#define vstore_partial_1_11  NO_STORE
#define vstore_partial_1_12  NO_STORE
#define vstore_partial_1_13  NO_STORE
#define vstore_partial_1_14  NO_STORE
#define vstore_partial_1_15  NO_STORE
#define vstore_partial_1_16  NO_STORE

/* size 2 */
#define vstore_partial_2_0   NO_STORE
#define vstore_partial_2_1   vstore_partial_1
#define vstore_partial_2_2   vstore_partial_2
#define vstore_partial_2_3   NO_STORE
#define vstore_partial_2_4   NO_STORE
#define vstore_partial_2_5   NO_STORE
#define vstore_partial_2_6   NO_STORE
#define vstore_partial_2_7   NO_STORE
#define vstore_partial_2_8   NO_STORE
#define vstore_partial_2_9   NO_STORE
#define vstore_partial_2_10  NO_STORE
#define vstore_partial_2_11  NO_STORE
#define vstore_partial_2_12  NO_STORE
#define vstore_partial_2_13  NO_STORE
#define vstore_partial_2_14  NO_STORE
#define vstore_partial_2_15  NO_STORE
#define vstore_partial_2_16  NO_STORE

/* size 3 */
#define vstore_partial_3_0   NO_STORE
#define vstore_partial_3_1   vstore_partial_1
#define vstore_partial_3_2   vstore_partial_2
#define vstore_partial_3_3   vstore_partial_3
#define vstore_partial_3_4   NO_STORE
#define vstore_partial_3_5   NO_STORE
#define vstore_partial_3_6   NO_STORE
#define vstore_partial_3_7   NO_STORE
#define vstore_partial_3_8   NO_STORE
#define vstore_partial_3_9   NO_STORE
#define vstore_partial_3_10  NO_STORE
#define vstore_partial_3_11  NO_STORE
#define vstore_partial_3_12  NO_STORE
#define vstore_partial_3_13  NO_STORE
#define vstore_partial_3_14  NO_STORE
#define vstore_partial_3_15  NO_STORE
#define vstore_partial_3_16  NO_STORE

/* size 4 */
#define vstore_partial_4_0   NO_STORE
#define vstore_partial_4_1   vstore_partial_1
#define vstore_partial_4_2   vstore_partial_2
#define vstore_partial_4_3   vstore_partial_3
#define vstore_partial_4_4   vstore_partial_4
#define vstore_partial_4_5   NO_STORE
#define vstore_partial_4_6   NO_STORE
#define vstore_partial_4_7   NO_STORE
#define vstore_partial_4_8   NO_STORE
#define vstore_partial_4_9   NO_STORE
#define vstore_partial_4_10  NO_STORE
#define vstore_partial_4_11  NO_STORE
#define vstore_partial_4_12  NO_STORE
#define vstore_partial_4_13  NO_STORE
#define vstore_partial_4_14  NO_STORE
#define vstore_partial_4_15  NO_STORE
#define vstore_partial_4_16  NO_STORE

/* size 8 */
#define vstore_partial_8_0   NO_STORE
#define vstore_partial_8_1   vstore_partial_1
#define vstore_partial_8_2   vstore_partial_2
#define vstore_partial_8_3   vstore_partial_3
#define vstore_partial_8_4   vstore_partial_4
#define vstore_partial_8_5   vstore_partial_5
#define vstore_partial_8_6   vstore_partial_6
#define vstore_partial_8_7   vstore_partial_7
#define vstore_partial_8_8   vstore_partial_8
#define vstore_partial_8_9   NO_STORE
#define vstore_partial_8_10  NO_STORE
#define vstore_partial_8_11  NO_STORE
#define vstore_partial_8_12  NO_STORE
#define vstore_partial_8_13  NO_STORE
#define vstore_partial_8_14  NO_STORE
#define vstore_partial_8_15  NO_STORE
#define vstore_partial_8_16  NO_STORE

/* size 16 */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
813*c217d954SCole Faust
814*c217d954SCole Faust
/*
 * vstore_partial_<n>(DATA, OFFSET, PTR): store the first n components of DATA
 * to memory at PTR.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 are a single vstore<n>. The other sizes are
 * built from a 4- or 8-wide head followed by a tail stored at PTR + 4 or
 * PTR + 8.
 *
 * Every expansion ends with a semicolon, and the composite sizes expand to
 * more than one statement, so these macros must not be used as the unbraced
 * body of an if / else / loop.
 *
 * NOTE(review): the tail piece reuses OFFSET unchanged on a shifted pointer;
 * this looks correct only for OFFSET == 0 -- confirm against callers.
 */
#define vstore_partial_1(vec, offset, ptr) \
    vstore1(vec.s0, offset, ptr);

#define vstore_partial_2(vec, offset, ptr) \
    vstore2(vec.s01, offset, ptr);

#define vstore_partial_3(vec, offset, ptr) \
    vstore3(vec.s012, offset, ptr);

#define vstore_partial_4(vec, offset, ptr) \
    vstore4(vec.s0123, offset, ptr);

/* 5 = 4 + 1 */
#define vstore_partial_5(vec, offset, ptr)    \
    vstore_partial_4(vec.s0123, offset, ptr); \
    vstore1(vec.s4, offset, ptr + 4);

/* 6 = 4 + 2 */
#define vstore_partial_6(vec, offset, ptr)    \
    vstore_partial_4(vec.s0123, offset, ptr); \
    vstore_partial_2(vec.s45, offset, ptr + 4);

/* 7 = 4 + 3 */
#define vstore_partial_7(vec, offset, ptr)    \
    vstore_partial_4(vec.s0123, offset, ptr); \
    vstore_partial_3(vec.s456, offset, ptr + 4);

#define vstore_partial_8(vec, offset, ptr) \
    vstore8(vec.s01234567, offset, ptr);

/* 9 = 8 + 1 */
#define vstore_partial_9(vec, offset, ptr)        \
    vstore_partial_8(vec.s01234567, offset, ptr); \
    vstore1(vec.s8, offset, ptr + 8);

/* 10 = 8 + 2 */
#define vstore_partial_10(vec, offset, ptr)       \
    vstore_partial_8(vec.s01234567, offset, ptr); \
    vstore_partial_2(vec.s89, offset, ptr + 8);

/* 11 = 8 + 3 */
#define vstore_partial_11(vec, offset, ptr)       \
    vstore_partial_8(vec.s01234567, offset, ptr); \
    vstore_partial_3(vec.s89a, offset, ptr + 8);

/* 12 = 8 + 4 */
#define vstore_partial_12(vec, offset, ptr)       \
    vstore_partial_8(vec.s01234567, offset, ptr); \
    vstore_partial_4(vec.s89ab, offset, ptr + 8);

/* 13 = 8 + 5: the whole upper half is handed to the 5-wide helper */
#define vstore_partial_13(vec, offset, ptr)       \
    vstore_partial_8(vec.s01234567, offset, ptr); \
    vstore_partial_5(vec.s89abcdef, offset, ptr + 8);

/* 14 = 8 + 6 */
#define vstore_partial_14(vec, offset, ptr)       \
    vstore_partial_8(vec.s01234567, offset, ptr); \
    vstore_partial_6(vec.s89abcdef, offset, ptr + 8);

/* 15 = 8 + 7 */
#define vstore_partial_15(vec, offset, ptr)       \
    vstore_partial_8(vec.s01234567, offset, ptr); \
    vstore_partial_7(vec.s89abcdef, offset, ptr + 8);

#define vstore_partial_16(vec, offset, ptr) \
    vstore16(vec, offset, ptr);
872*c217d954SCole Faust
873*c217d954SCole Faust
874*c217d954SCole Faust
875*c217d954SCole Faust
876*c217d954SCole Faust
/*
 * Saturating-conversion names for floating-point destinations.
 *
 * CONVERT_SAT pastes _sat onto convert_<type>; for float and half
 * destinations these aliases map the resulting name back to the plain
 * conversion of the SAME destination type, and the <type>1 spellings map to
 * the scalar conversion.
 *
 * convert_half_sat previously mapped to convert_float, unlike every other
 * entry in this list; it now maps to convert_half so that the result has the
 * requested type.
 * NOTE(review): convert_half needs half-precision support to be enabled;
 * confirm no caller relied on the float-returning form without it.
 */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16
891*c217d954SCole Faust
/*
 * One-component conversion names: convert_<type>1 resolves to the scalar
 * convert_<type>, so macros that paste a vector size onto the type keep
 * working when the size is 1.
 */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
903*c217d954SCole Faust
/*
 * One-component saturating conversion names: convert_<type>1_sat resolves to
 * the scalar convert_<type>_sat.
 *
 * The convert_uchar<2|3|4|8|16>_sat entries are defined as themselves; a
 * self-referential macro is not expanded again, so those names are left
 * unchanged.
 */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
#define convert_double1_sat convert_double_sat
918*c217d954SCole Faust
/*
 * VEC_DATA_TYPE(type, size): vector type name <type><size>. Both arguments
 * are expanded before VEC_DATA_TYPE_STR pastes them.
 */
#define VEC_DATA_TYPE_STR(elem, n) elem##n
#define VEC_DATA_TYPE(elem, n) VEC_DATA_TYPE_STR(elem, n)

/* CONVERT(x, type): convert_<type>(x), with type expanded before pasting. */
#define CONVERT_STR(value, dst_type) (convert_##dst_type((value)))
#define CONVERT(value, dst_type) CONVERT_STR(value, dst_type)

/* CONVERT_SAT(x, type): convert_<type>_sat(x), with type expanded before pasting. */
#define CONVERT_SAT_STR(value, dst_type) (convert_##dst_type##_sat((value)))
#define CONVERT_SAT(value, dst_type) CONVERT_SAT_STR(value, dst_type)

/*
 * CONVERT_SAT_ROUND(x, type, round): convert_<type>_sat_<round>(x), with type
 * and round expanded before pasting.
 */
#define CONVERT_SAT_ROUND_STR(value, dst_type, mode) (convert_##dst_type##_sat_##mode((value)))
#define CONVERT_SAT_ROUND(value, dst_type, mode) CONVERT_SAT_ROUND_STR(value, dst_type, mode)
930*c217d954SCole Faust
/** Per-type lookup used by SELECT_VEC_DATA_TYPE.
 *
 * Integer types map to themselves; half maps to short<size> and float maps to
 * int<size>.
 */
#define select_vec_dt_uchar(size)  uchar##size
#define select_vec_dt_char(size)   char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size)  short##size
#define select_vec_dt_half(size)   short##size
#define select_vec_dt_uint(size)   uint##size
#define select_vec_dt_int(size)    int##size
#define select_vec_dt_float(size)  int##size
#define select_vec_dt_ulong(size)  ulong##size
#define select_vec_dt_long(size)   long##size

/** Resolves (type, size) through the select_vec_dt_<type> table above.
 *
 * SELECT_VEC_DATA_TYPE expands its arguments first; the *_STR form pastes them
 * as written. SELECT_DATA_TYPE asks for size 1, which yields a name with a 1
 * suffix (for example int1).
 * NOTE(review): the 1-suffixed type names are presumably aliased elsewhere in
 * the file; confirm before relying on SELECT_DATA_TYPE.
 */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size)     SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type)               SELECT_VEC_DATA_TYPE_STR(type, 1)
945*c217d954SCole Faust
/** Per-type lookup used by SIGNED_INT_VEC_DATA_TYPE.
 *
 * Every entry yields a signed integer vector name: uchar/char give char<size>,
 * ushort/short/half give short<size>, uint/int/float give int<size> and
 * ulong/long give long<size>.
 */
#define signed_int_vec_dt_uchar(size)  char##size
#define signed_int_vec_dt_char(size)   char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size)  short##size
#define signed_int_vec_dt_half(size)   short##size
#define signed_int_vec_dt_uint(size)   int##size
#define signed_int_vec_dt_int(size)    int##size
#define signed_int_vec_dt_float(size)  int##size
#define signed_int_vec_dt_ulong(size)  long##size
#define signed_int_vec_dt_long(size)   long##size

/** Resolves (type, size) through the signed_int_vec_dt_<type> table above.
 *
 * SIGNED_INT_VEC_DATA_TYPE expands its arguments first; the *_STR form pastes
 * them as written. SIGNED_INT_DATA_TYPE asks for size 1.
 */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size)     SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type)               SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
960*c217d954SCole Faust
/** Building blocks for SUM_REDUCE: each expands to a flat chain of additions
 *  over the components of x.
 *
 * These helpers are deliberately left without outer parentheses. Wrapping each
 * of them would regroup the chain pairwise and could change floating-point
 * rounding. Do not use them directly inside a larger expression; go through
 * SUM_REDUCE, which parenthesizes the complete chain.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/** Sum of the components of x, where size is 1, 2, 3, 4, 8 or 16.
 *
 * The expansion is wrapped in parentheses so it behaves as a single operand:
 * without them, 2 * SUM_REDUCE(v, 2) would multiply only the first component.
 * The order in which the components are added is unchanged.
 */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
970*c217d954SCole Faust
/** Building blocks for PROD_REDUCE: each expands to a flat chain of
 *  multiplications over the components of x.
 *
 * These helpers are deliberately left without outer parentheses so the order
 * in which the factors are combined stays as it was. Do not use them directly
 * inside a larger expression; go through PROD_REDUCE, which parenthesizes the
 * complete chain.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/** Product of the components of x, where size is 1, 2, 3, 4, 8 or 16.
 *
 * The expansion is wrapped in parentheses so it behaves as a single operand:
 * without them, a / PROD_REDUCE(v, 2) would divide by the first component only
 * and then multiply by the second.
 */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
980*c217d954SCole Faust
/** Building blocks for MAX_REDUCE: nested max() calls over the components of x.
 *
 * Wider sizes split x into two halves and combine the maxima of the halves.
 */
#define max_reduce_1(x)  (x)
#define max_reduce_2(x)  max((x).s0, (x).s1)
#define max_reduce_3(x)  max(max_reduce_2((x).s01), (x).s2)
#define max_reduce_4(x)  max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x)  max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

/** Largest component of x, where size is 1, 2, 3, 4, 8 or 16.
 *
 * MAX_REDUCE expands its arguments before MAX_REDUCE_STR pastes size onto the
 * helper name.
 */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size)     MAX_REDUCE_STR(x, size)
990*c217d954SCole Faust
/** Parameter-list fragment describing a 1D buffer called name.
 *
 * Expands to: a __global uchar pointer name_ptr, the uint pair
 * name_stride_x / name_step_x, and uint name_offset_first_element_in_bytes.
 */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr,  \
    uint name##_stride_x,        \
    uint name##_step_x,          \
    uint name##_offset_first_element_in_bytes

/** Same as VECTOR_DECLARATION with an extra stride/step pair for y. */
#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x,       \
    uint name##_step_x,         \
    uint name##_stride_y,       \
    uint name##_step_y,         \
    uint name##_offset_first_element_in_bytes

/** Same as IMAGE_DECLARATION with an extra stride/step pair for z. */
#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_offset_first_element_in_bytes

/** Same as TENSOR3D_DECLARATION with an extra stride/step pair for w. */
#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_offset_first_element_in_bytes

/** Same as TENSOR4D_DECLARATION with an extra stride/step pair for v. */
#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_stride_v,          \
    uint name##_step_v,            \
    uint name##_offset_first_element_in_bytes
1040*c217d954SCole Faust
/** Builds a Vector from the arguments declared for name, forwarding name_step_x
 *  to update_vector_workitem_ptr.
 */
#define CONVERT_TO_VECTOR_STRUCT(name)                                            \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                               name##_stride_x, name##_step_x)

/** Same as CONVERT_TO_VECTOR_STRUCT but passes 0 as the x step. */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)                                    \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                               name##_stride_x, 0)

/** Builds an Image from the arguments declared for name, forwarding the x and y
 *  steps to update_image_workitem_ptr.
 */
#define CONVERT_TO_IMAGE_STRUCT(name)                                            \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                              name##_stride_x, name##_step_x,                   \
                              name##_stride_y, name##_step_y)

/** Same as CONVERT_TO_IMAGE_STRUCT but passes 0 for both steps. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name)                                    \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                              name##_stride_x, 0,                               \
                              name##_stride_y, 0)
1052*c217d954SCole Faust
/** Builds an Image from the 3D tensor arguments declared for name, forwarding
 *  the x, y and z steps to update_image_from_tensor3D_workitem_ptr.
 *
 * This macro used to be defined twice with identical text; the redundant second
 * definition has been removed.
 */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/** Same as CONVERT_TENSOR3D_TO_IMAGE_STRUCT but passes 0 for the x and y steps;
 *  the z step is still forwarded.
 */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1061*c217d954SCole Faust
/** Builds a Tensor3D from the arguments declared for name, forwarding the x, y
 *  and z steps to update_tensor3D_workitem_ptr.
 */
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                            \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x,                   \
                                 name##_stride_y, name##_step_y,                   \
                                 name##_stride_z, name##_step_z)

/** Same as CONVERT_TO_TENSOR3D_STRUCT but passes 0 for all three steps. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)                                    \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0,                               \
                                 name##_stride_y, 0,                               \
                                 name##_stride_z, 0)

/** Builds a Tensor4D from the arguments declared for name. mod_size is handed
 *  to update_tensor4D_workitem_ptr, which uses it to split get_global_id(2)
 *  into a z part (remainder) and a w part (quotient).
 */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                  \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x,                   \
                                 name##_stride_y, name##_step_y,                   \
                                 name##_stride_z, name##_step_z,                   \
                                 name##_stride_w, name##_step_w, mod_size)

/** Same as CONVERT_TO_TENSOR4D_STRUCT but passes 0 for all four steps. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size)                          \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0,                               \
                                 name##_stride_y, 0,                               \
                                 name##_stride_z, 0,                               \
                                 name##_stride_w, 0, mod_size)

/** Builds a Tensor3D through tensor3D_ptr_no_update, which stores the pointer
 *  as given instead of advancing it for the current work-item.
 */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                        \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, \
                           name##_stride_x, name##_step_x,                   \
                           name##_stride_y, name##_step_y,                   \
                           name##_stride_z, name##_step_z)
1079*c217d954SCole Faust
1080*c217d954SCole Faust
/** Handle for a 1D buffer in global memory. */
struct Vector
{
    __global uchar *ptr;                /* byte pointer into the buffer */
    int offset_first_element_in_bytes;  /* byte offset the update helpers add to ptr */
    int stride_x;                       /* multiplied by the x index in vector_offset */
};
typedef struct Vector Vector;
1087*c217d954SCole Faust
1088*c217d954SCole Faust
/** Handle for a 2D buffer in global memory. */
struct Image
{
    __global uchar *ptr;                /* byte pointer into the buffer */
    int offset_first_element_in_bytes;  /* byte offset the update helpers add to ptr */
    int stride_x;                       /* multiplied by the x index in offset() */
    int stride_y;                       /* multiplied by the y index in offset() */
};
typedef struct Image Image;
1096*c217d954SCole Faust
1097*c217d954SCole Faust
/** Handle for a 3D buffer in global memory. */
struct Tensor3D
{
    __global uchar *ptr;                /* byte pointer into the buffer */
    int offset_first_element_in_bytes;  /* byte offset the update helpers add to ptr */
    int stride_x;                       /* multiplied by the x index in tensor3D_offset */
    int stride_y;                       /* multiplied by the y index in tensor3D_offset */
    int stride_z;                       /* multiplied by the z index in tensor3D_offset */
};
typedef struct Tensor3D Tensor3D;
1106*c217d954SCole Faust
1107*c217d954SCole Faust
/** Handle for a 4D buffer in global memory. */
struct Tensor4D
{
    __global uchar *ptr;                /* byte pointer into the buffer */
    int offset_first_element_in_bytes;  /* byte offset the update helper adds to ptr */
    int stride_x;                       /* multiplied by the x index in tensor4D_offset */
    int stride_y;                       /* multiplied by the y index in tensor4D_offset */
    int stride_z;                       /* multiplied by the z index in tensor4D_offset */
    int stride_w;                       /* multiplied by the w index in tensor4D_offset */
};
typedef struct Tensor4D Tensor4D;
1117*c217d954SCole Faust
1118*c217d954SCole Faust
/** Wraps a 1D buffer in a Vector and moves its pointer to the current work-item.
 *
 * @param ptr                           Base byte pointer of the buffer.
 * @param offset_first_element_in_bytes Byte offset stored in the struct and added to the pointer.
 * @param stride_x                      Value stored in Vector::stride_x.
 * @param step_x                        Bytes added per unit of get_global_id(0).
 *
 * @return Vector whose ptr is ptr + offset_first_element_in_bytes + get_global_id(0) * step_x.
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;

    /* The offset is read back from the struct field, as in the original expression. */
    result.ptr += result.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return result;
}
1130*c217d954SCole Faust
1131*c217d954SCole Faust
/** Wraps a 2D buffer in an Image and moves its pointer to the current work-item.
 *
 * @param ptr                           Base byte pointer of the buffer.
 * @param offset_first_element_in_bytes Byte offset stored in the struct and added to the pointer.
 * @param stride_x                      Value stored in Image::stride_x.
 * @param step_x                        Bytes added per unit of get_global_id(0).
 * @param stride_y                      Value stored in Image::stride_y.
 * @param step_y                        Bytes added per unit of get_global_id(1).
 *
 * @return Image whose ptr has been advanced by the offset and the two step terms.
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;

    /* The offset is read back from the struct field, as in the original expression. */
    result.ptr += result.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return result;
}
1144*c217d954SCole Faust
1145*c217d954SCole Faust
/** Wraps a 3D buffer in an Image and moves its pointer to the current work-item.
 *
 * The returned struct carries only the x and y strides; stride_z is accepted
 * but not stored. All three steps contribute to the pointer offset.
 *
 * @param ptr                           Base byte pointer of the buffer.
 * @param offset_first_element_in_bytes Byte offset stored in the struct and added to the pointer.
 * @param stride_x                      Value stored in Image::stride_x.
 * @param step_x                        Bytes added per unit of get_global_id(0).
 * @param stride_y                      Value stored in Image::stride_y.
 * @param step_y                        Bytes added per unit of get_global_id(1).
 * @param stride_z                      Unused.
 * @param step_z                        Bytes added per unit of get_global_id(2).
 *
 * @return Image whose ptr has been advanced by the offset and the three step terms.
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;

    /* The offset is read back from the struct field, as in the original expression. */
    result.ptr += result.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return result;
}
1158*c217d954SCole Faust
1159*c217d954SCole Faust
/** Wraps a 3D buffer in a Tensor3D and moves its pointer to the current work-item.
 *
 * @param ptr                           Base byte pointer of the buffer.
 * @param offset_first_element_in_bytes Byte offset stored in the struct and added to the pointer.
 * @param stride_x                      Value stored in Tensor3D::stride_x.
 * @param step_x                        Bytes added per unit of get_global_id(0).
 * @param stride_y                      Value stored in Tensor3D::stride_y.
 * @param step_y                        Bytes added per unit of get_global_id(1).
 * @param stride_z                      Value stored in Tensor3D::stride_z.
 * @param step_z                        Bytes added per unit of get_global_id(2).
 *
 * @return Tensor3D whose ptr has been advanced by the offset and the three step terms.
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;

    /* The offset is read back from the struct field, as in the original expression. */
    result.ptr += result.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return result;
}
1173*c217d954SCole Faust
1174*c217d954SCole Faust
/** Wraps a 3D buffer in a Tensor3D without moving the pointer.
 *
 * The pointer, offset and strides are copied into the struct as given. The
 * step_x, step_y and step_z parameters are accepted but not used.
 *
 * @return Tensor3D whose ptr equals the ptr argument.
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;
    return result;
}
1187*c217d954SCole Faust
/** Wraps a 4D buffer in a Tensor4D and moves its pointer to the current work-item.
 *
 * get_global_id(2) is split by mod_size: the remainder is scaled by step_z and
 * the quotient by step_w.
 *
 * @param ptr                           Base byte pointer of the buffer.
 * @param offset_first_element_in_bytes Byte offset stored in the struct and added to the pointer.
 * @param stride_x                      Value stored in Tensor4D::stride_x.
 * @param step_x                        Bytes added per unit of get_global_id(0).
 * @param stride_y                      Value stored in Tensor4D::stride_y.
 * @param step_y                        Bytes added per unit of get_global_id(1).
 * @param stride_z                      Value stored in Tensor4D::stride_z.
 * @param step_z                        Bytes added per unit of get_global_id(2) % mod_size.
 * @param stride_w                      Value stored in Tensor4D::stride_w.
 * @param step_w                        Bytes added per unit of get_global_id(2) / mod_size.
 * @param mod_size                      Divisor applied to get_global_id(2); the code does not guard against 0.
 *
 * @return Tensor4D whose ptr has been advanced by the offset and the four step terms.
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;
    result.stride_w                      = stride_w;

    const size_t gid_2   = get_global_id(2);
    const size_t z_index = gid_2 % mod_size;
    const size_t w_index = gid_2 / mod_size;

    /* The offset is read back from the struct field, as in the original expression. */
    result.ptr += result.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + z_index * step_z + w_index * step_w;
    return result;
}
1205*c217d954SCole Faust
1206*c217d954SCole Faust
/** Address of the element x strides away from the vector pointer.
 *
 * @param vec Vector handle to read ptr and stride_x from.
 * @param x   Index multiplied by vec->stride_x (may be negative).
 *
 * @return vec->ptr + x * vec->stride_x.
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    __global const uchar *element = vec->ptr;
    element += x * vec->stride_x;
    return element;
}
1211*c217d954SCole Faust
1212*c217d954SCole Faust
/** Address of the element at (x, y) relative to the image pointer.
 *
 * @param img Image handle to read ptr, stride_x and stride_y from.
 * @param x   Index multiplied by img->stride_x.
 * @param y   Index multiplied by img->stride_y.
 *
 * @return img->ptr + x * img->stride_x + y * img->stride_y.
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    __global uchar *element = img->ptr;
    element += x * img->stride_x;
    element += y * img->stride_y;
    return element;
}
1217*c217d954SCole Faust
1218*c217d954SCole Faust
/** Address of the element at (x, y, z) relative to the tensor pointer.
 *
 * @param tensor Tensor3D handle to read ptr and the three strides from.
 * @param x      Index multiplied by tensor->stride_x.
 * @param y      Index multiplied by tensor->stride_y.
 * @param z      Index multiplied by tensor->stride_z.
 *
 * @return tensor->ptr advanced by the three index * stride products.
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    __global const uchar *element = tensor->ptr;
    element += x * tensor->stride_x;
    element += y * tensor->stride_y;
    element += z * tensor->stride_z;
    return element;
}
1223*c217d954SCole Faust
1224*c217d954SCole Faust
/** Address of the element at (x, y, z, w) relative to the tensor pointer.
 *
 * @param tensor Tensor4D handle to read ptr and the four strides from.
 * @param x      Index multiplied by tensor->stride_x.
 * @param y      Index multiplied by tensor->stride_y.
 * @param z      Index multiplied by tensor->stride_z.
 * @param w      Index multiplied by tensor->stride_w.
 *
 * @return tensor->ptr advanced by the four index * stride products.
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    __global const uchar *element = tensor->ptr;
    element += x * tensor->stride_x;
    element += y * tensor->stride_y;
    element += z * tensor->stride_z;
    element += w * tensor->stride_w;
    return element;
}
1229*c217d954SCole Faust
1230*c217d954SCole Faust
/** Converts a linear element index into an address inside a 3D tensor.
 *
 * The index is decomposed as z = index / (width * height), then
 * y = remainder / width and x = remainder % width. The result also adds
 * tensor->offset_first_element_in_bytes.
 *
 * @param tensor Tensor3D handle to read ptr, strides and offset from.
 * @param width  Extent used for the x decomposition; the code does not guard against 0.
 * @param height Extent used together with width to size one z slice.
 * @param depth  Unused.
 * @param index  Linear index to decompose.
 *
 * @return Address computed from the (x, y, z) coordinates and the stored offset.
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint slice_size = width * height;

    const uint z         = index / slice_size;
    const uint in_slice  = index % slice_size;
    const uint y         = in_slice / width;
    const uint x         = in_slice % width;

    __global const uchar *element = tensor->ptr;
    element += x * tensor->stride_x;
    element += y * tensor->stride_y;
    element += z * tensor->stride_z;
    element += tensor->offset_first_element_in_bytes;
    return element;
}
1247*c217d954SCole Faust
1248*c217d954SCole Faust#endif
1249*c217d954SCole Faust
/** Multiply-accumulate: evaluates a + b * c.
 *
 * When GPU_ARCH compares equal to GPU_ARCH_BIFROST the fma() built-in is used;
 * otherwise the expression is spelled out with ordinary operators.
 */
#if GPU_ARCH != GPU_ARCH_BIFROST
#define MLA(a, b, c) ((b) * (c) + (a))
#else
#define MLA(a, b, c) (fma(c, b, a))
#endif
1255*c217d954SCole Faust
1256*c217d954SCole Faust
/** Activation formulas selected by ACTIVATION(op, ...).
 *
 * Every <op>_op macro takes the same five arguments:
 *   DATA_TYPE  type used to cast the constants and A_VAL / B_VAL
 *   VEC_SIZE   vector width (only elu_op uses it, to pick the select mask type)
 *   x          input value
 *   A_VAL      first parameter of the activation (ignored by some formulas)
 *   B_VAL      second parameter of the activation (ignored by some formulas)
 *
 * x, A_VAL and B_VAL are parenthesized wherever they sit next to an operator or
 * a cast, so passing an expression such as a - b expands correctly. Previously
 * logistic_op turned that argument into exp(-a - b) and square_op into
 * a - b * a - b.
 */

/** x * clamp(x + 3, 0, 6) / 6, with 1/6 written as 0.166666667 */
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * ((min(max(((x) + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

/** 1 / (1 + exp(-x)) */
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-(x))))

/** A_VAL * tanh(B_VAL * x) */
#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)(A_VAL) * tanh((DATA_TYPE)(B_VAL) * (x)))

/** max(0, x) */
#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))

/** min(A_VAL, max(0, x)) */
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)0.0, x)))

/** min(max(x, B_VAL), A_VAL) */
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

/** min(x, 0) * A_VAL + max(x, 0) */
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)(A_VAL)) + max(x, (DATA_TYPE)0.0))

/** log(1 + exp(x)) */
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))

/** x where x >= 0, otherwise A_VAL * (exp(x) - 1) */
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)(A_VAL) * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))

/** fabs(x) */
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))

/** x * x */
#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (x))

/** sqrt(x) */
#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))

/** B_VAL + A_VAL * x, computed through MLA */
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)(B_VAL), (DATA_TYPE)(A_VAL), x))

/** x * 0.5 * (1 + erf(x / 1.41421356237)) */
#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf((x) / (DATA_TYPE)1.41421356237)))

/** x unchanged */
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

/** Pastes _op onto op and invokes the resulting formula macro. */
#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

/** Public entry point: expands its arguments (so op may itself be a macro) and
 *  then dispatches through ACT_OP.
 */
#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1304*c217d954SCole Faust
1305*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
1306*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
1307*c217d954SCole Faust
1308*c217d954SCole Faust
1309*c217d954SCole Faust
1310*c217d954SCole Faust
/** STORE_ROW_n stores rows 0 .. n-1 of a block with VSTORE(N0).
 *
 * Each macro first expands the previous one and then adds one more store, so
 * STORE_ROW_n emits n statements. For row i the stored value is BASENAME<i>
 * and the destination is PTR + i * STRIDE_Y + Z<i>, cast to a __global
 * DATA_TYPE pointer. Rows 10 to 15 use the suffixes A to F.
 *
 * Every store already ends with a semicolon, so call sites need none.
 * NOTE(review): PTR and STRIDE_Y are pasted into the address expression
 * unparenthesized; callers presumably pass plain identifiers.
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389*c217d954SCole Faust
1390*c217d954SCole Faust
1391*c217d954SCole Faust
/** CONVERT_STORE_ROW_n stores rows 0 .. n-1 of a block after a saturating
 *  conversion.
 *
 * Same layout as STORE_ROW_n, except that each BASENAME<i> is first passed
 * through CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0). Each macro expands the
 * previous one and then adds the store for its own row; every store already
 * ends with a semicolon.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
     (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
     (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
     (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
     (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
     (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
     (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
     (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
     (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1430*c217d954SCole Faust
1431*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1432*c217d954SCole Faust    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1433*c217d954SCole Faust    VSTORE(N0)                                                         \
1434*c217d954SCole Faust    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435*c217d954SCole Faust
1436*c217d954SCole Faust#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
1437*c217d954SCole Faust    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1438*c217d954SCole Faust    VSTORE(N0)                                                     \
1439*c217d954SCole Faust    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440*c217d954SCole Faust
1441*c217d954SCole Faust#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1442*c217d954SCole Faust    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1443*c217d954SCole Faust    VSTORE(N0)                                                          \
1444*c217d954SCole Faust    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1445*c217d954SCole Faust
1446*c217d954SCole Faust#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1447*c217d954SCole Faust    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1448*c217d954SCole Faust    VSTORE(N0)                                                          \
1449*c217d954SCole Faust    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1450*c217d954SCole Faust
1451*c217d954SCole Faust#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1452*c217d954SCole Faust    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1453*c217d954SCole Faust    VSTORE(N0)                                                          \
1454*c217d954SCole Faust    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1455*c217d954SCole Faust
1456*c217d954SCole Faust#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1457*c217d954SCole Faust    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1458*c217d954SCole Faust    VSTORE(N0)                                                          \
1459*c217d954SCole Faust    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1460*c217d954SCole Faust
1461*c217d954SCole Faust#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1462*c217d954SCole Faust    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1463*c217d954SCole Faust    VSTORE(N0)                                                          \
1464*c217d954SCole Faust    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1465*c217d954SCole Faust
1466*c217d954SCole Faust#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1467*c217d954SCole Faust    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1468*c217d954SCole Faust    VSTORE(N0)                                                          \
1469*c217d954SCole Faust    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1470*c217d954SCole Faust
1471*c217d954SCole Faust
1472*c217d954SCole Faust
1473*c217d954SCole Faust
/* STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to STORE_ROW_<M0>. The indirection through STORE_BLOCK_STR lets
 * M0 be macro-expanded before it is pasted onto the STORE_ROW_ prefix. */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1476*c217d954SCole Faust
1477*c217d954SCole Faust
1478*c217d954SCole Faust
/* CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to CONVERT_STORE_ROW_<M0>. The _STR helper exists so that M0 is
 * macro-expanded before being pasted onto the CONVERT_STORE_ROW_ prefix. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1481*c217d954SCole Faust
1482*c217d954SCole Faust
1483*c217d954SCole Faust
/* STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16
 *
 * Expands to n VSTORE_PARTIAL(N0, STORE_N0) statements, one per row variable
 * BASENAME##r (r = 0..9, A..F), each written through
 * (__global DATA_TYPE *)(PTR + row * STRIDE_Y + Z##r).
 * Every macro first expands its predecessor and then appends its own row. */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1562*c217d954SCole Faust
1563*c217d954SCole Faust
1564*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to STORE_ROW_PARTIAL_<STORE_M0>. Note the argument reordering:
 * the row macro receives N0 first and STORE_N0 second. The _STR helper lets
 * STORE_M0 be macro-expanded before it is pasted onto the macro name. */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1567*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                                PARTIAL_STORE_M0, PARTIAL_STORE_N0,
 *                                PARTIAL_COND_Y, PARTIAL_COND_X)
 *
 * Expands to an if / else-if chain that picks one STORE_BLOCK_PARTIAL call:
 *   neither condition set : M0               rows, N0               columns
 *   only PARTIAL_COND_Y   : PARTIAL_STORE_M0 rows, N0               columns
 *   only PARTIAL_COND_X   : M0               rows, PARTIAL_STORE_N0 columns
 *   both conditions set   : PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 * The expansion is a bare if statement, so it must be used where a statement
 * is allowed. */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!((PARTIAL_COND_X) || (PARTIAL_COND_Y))) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1585*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                          PARTIAL_STORE_N0, PARTIAL_COND_X)
 *
 * Always stores M0 rows. When PARTIAL_COND_X is true only PARTIAL_STORE_N0
 * columns of each N0-wide row are stored, otherwise all N0 columns are. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1595*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                          PARTIAL_STORE_M0, PARTIAL_COND_Y)
 *
 * Always stores all N0 columns. When PARTIAL_COND_Y is true only
 * PARTIAL_STORE_M0 rows are stored, otherwise all M0 rows are. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
1605*c217d954SCole Faust
1606*c217d954SCole Faust
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

/* STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z,
 *                            PARTIAL_STORE_M0, PARTIAL_STORE_N0,
 *                            PARTIAL_COND_Y, PARTIAL_COND_X)
 *
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined.
 * The implementation is chosen at preprocessing time from their values:
 *   M0 == 0, N0 >  0 : STORE_BLOCK_PARTIAL_IN_X
 *   M0 >  0, N0 == 0 : STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0, N0 == 0 : plain STORE_BLOCK
 *   anything else    : STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * The first three cases are mutually exclusive, so their order is irrelevant. */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
1633*c217d954SCole Faust
1634*c217d954SCole Faust
/* COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * Yields, as a uint, the first row handled by block index y when blocks are
 * M0 rows tall.
 *  - When PARTIAL_STORE_M0 is defined at this point, the start row is
 *    y * M0 shifted back by (M0 - PARTIAL_STORE_M0) % M0 and clamped at 0.
 *  - Otherwise it is simply y * M0.
 *
 * Every macro argument is parenthesized so that expression arguments
 * (for example a sum passed as y or M0) keep their intended grouping. */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
1643*c217d954SCole Faust
1644*c217d954SCole Faust
1645*c217d954SCole Faust
/* STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
 *
 * Single-row wrapper around STORE_BLOCK_PARTIAL_IN_X: M0 is fixed to 1 and
 * both STRIDE_Y and Z are passed as 0. vec_size plays the role of N0,
 * leftover that of PARTIAL_STORE_N0 and cond that of PARTIAL_COND_X. */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1,         /* M0               */ \
                             vec_size,  /* N0               */ \
                             data_type, /* DATA_TYPE        */ \
                             basename,  /* BASENAME         */ \
                             ptr,       /* PTR              */ \
                             0,         /* STRIDE_Y         */ \
                             0,         /* Z                */ \
                             leftover,  /* PARTIAL_STORE_N0 */ \
                             cond)      /* PARTIAL_COND_X   */
1648*c217d954SCole Faust
1649*c217d954SCole Faust
/* Optional OpenCL extensions. Each pragma is emitted only when both the
 * corresponding ARM_COMPUTE_* macro and the extension macro of the same name
 * as the extension are defined. */

/* cl_khr_fp16 */
#ifdef ARM_COMPUTE_OPENCL_FP16_ENABLED
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#endif

/* cl_arm_integer_dot_product_int8 */
#ifdef ARM_COMPUTE_OPENCL_DOT8_ENABLED
#ifdef cl_arm_integer_dot_product_int8
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif
#endif

/* cl_arm_integer_dot_product_accumulate_int8 */
#ifdef ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED
#ifdef cl_arm_integer_dot_product_accumulate_int8
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif
#endif

/* cl_arm_printf */
#ifdef ARM_COMPUTE_DEBUG_ENABLED
#ifdef cl_arm_printf
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif
#endif
1665*c217d954SCole Faust
/* Numeric identifiers for the GPU architecture families named below. */
#define GPU_ARCH_MIDGARD    0x100
#define GPU_ARCH_BIFROST    0x200
#define GPU_ARCH_VALHALL    0x300
1669*c217d954SCole Faust
1670*c217d954SCole Faust
/* CONCAT(a, b): pastes the two tokens a and b into a single token.
 * The arguments are pasted as written; they are not macro-expanded first. */
#define CONCAT(a, b)    a##b

/* EXPAND(x): identity macro; yields x after one more round of expansion. */
#define EXPAND(x)       x

/* CLAMP(x, min_val, max_val): limits x to [min_val, max_val] through
 * min(max(x, min_val), max_val). */
#define CLAMP(x, min_val, max_val) \
    min(max(x, min_val), max_val)
1678*c217d954SCole Faust
1679*c217d954SCole Faust
/* REVn(x): selects the n components of vector x in reverse order through a
 * component swizzle. REV1 returns x unchanged. */
#define REV1(x)     ((x))
#define REV2(x)     ((x).s10)
#define REV3(x)     ((x).s210)
#define REV4(x)     ((x).s3210)
#define REV8(x)     ((x).s76543210)
#define REV16(x)    ((x).sFEDCBA9876543210)

/* REVERSE(x, s): dispatches to REV<s>. The _STR helper lets s be
 * macro-expanded before it is pasted onto the REV prefix. */
#define REVERSE_STR(x, s)   REV##s((x))
#define REVERSE(x, s)       REVERSE_STR(x, s)
1691*c217d954SCole Faust
1692*c217d954SCole Faust
1693*c217d954SCole Faust
/* ROT<s>_<n>(x): component swizzle of the s-component vector x in which the
 * result component i is taken from source component (i - n) modulo s.
 * n == 0 and n == s both return x unchanged. */

/* 1 component */
#define ROT1_0(x)   ((x))
#define ROT1_1(x)   ((x))

/* 2 components */
#define ROT2_0(x)   ((x))
#define ROT2_1(x)   ((x).s10)
#define ROT2_2(x)   ((x))

/* 3 components */
#define ROT3_0(x)   ((x))
#define ROT3_1(x)   ((x).s201)
#define ROT3_2(x)   ((x).s120)
#define ROT3_3(x)   ((x))

/* 4 components */
#define ROT4_0(x)   ((x))
#define ROT4_1(x)   ((x).s3012)
#define ROT4_2(x)   ((x).s2301)
#define ROT4_3(x)   ((x).s1230)
#define ROT4_4(x)   ((x))

/* 8 components */
#define ROT8_0(x)   ((x))
#define ROT8_1(x)   ((x).s70123456)
#define ROT8_2(x)   ((x).s67012345)
#define ROT8_3(x)   ((x).s56701234)
#define ROT8_4(x)   ((x).s45670123)
#define ROT8_5(x)   ((x).s34567012)
#define ROT8_6(x)   ((x).s23456701)
#define ROT8_7(x)   ((x).s12345670)
#define ROT8_8(x)   ((x))

/* 16 components */
#define ROT16_0(x)  ((x))
#define ROT16_1(x)  ((x).sF0123456789ABCDE)
#define ROT16_2(x)  ((x).sEF0123456789ABCD)
#define ROT16_3(x)  ((x).sDEF0123456789ABC)
#define ROT16_4(x)  ((x).sCDEF0123456789AB)
#define ROT16_5(x)  ((x).sBCDEF0123456789A)
#define ROT16_6(x)  ((x).sABCDEF0123456789)
#define ROT16_7(x)  ((x).s9ABCDEF012345678)
#define ROT16_8(x)  ((x).s89ABCDEF01234567)
#define ROT16_9(x)  ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/* ROTATE(x, s, n): dispatches to ROT<s>_<n>. The _STR helper lets s and n be
 * macro-expanded before they are pasted into the macro name. */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n)     ROTATE_STR(x, s, n)
1744*c217d954SCole Faust
1745*c217d954SCole Faust
1746*c217d954SCole Faust
/* V_OFFSn(dt): vector literal of type dt##n whose components are the
 * ascending indices 0, 1, ..., n-1. */
#define V_OFFS1(dt)     (dt##1)(0)
#define V_OFFS2(dt)     (dt##2)(0, 1)
#define V_OFFS3(dt)     (dt##3)(0, 1, 2)
#define V_OFFS4(dt)     (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)     (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt)    (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s): dispatches to V_OFFS<s>. The _STR helper lets s be
 * macro-expanded before it is pasted onto the V_OFFS prefix. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s)     VEC_OFFS_STR(dt, s)
1758*c217d954SCole Faust
1759*c217d954SCole Faust
/* VLOAD(size): yields the identifier vload<size>. The _STR helper lets size
 * be macro-expanded before it is pasted. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size)     VLOAD_STR(size)

/* VLOAD_PARTIAL(size, load_size): yields the identifier
 * vload_partial_<size>_<load_size>. The _STR helper lets both arguments be
 * macro-expanded before they are pasted. */
#define VLOAD_PARTIAL_STR(size, load_size)  vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size)      VLOAD_PARTIAL_STR(size, load_size)
1766*c217d954SCole Faust
/* NO_LOAD(data, offs, ptr): expands to an empty compound statement; all three
 * arguments are discarded. */
#define NO_LOAD(data, offs, ptr) { }
1770*c217d954SCole Faust
1771*c217d954SCole Faust
1772*c217d954SCole Faust#define vload_partial_1_0 NO_LOAD
1773*c217d954SCole Faust#define vload_partial_1_1 vload1
1774*c217d954SCole Faust#define vload_partial_1_2 NO_LOAD
1775*c217d954SCole Faust#define vload_partial_1_3 NO_LOAD
1776*c217d954SCole Faust#define vload_partial_1_4 NO_LOAD
1777*c217d954SCole Faust#define vload_partial_1_5 NO_LOAD
1778*c217d954SCole Faust#define vload_partial_1_6 NO_LOAD
1779*c217d954SCole Faust#define vload_partial_1_7 NO_LOAD
1780*c217d954SCole Faust#define vload_partial_1_8 NO_LOAD
1781*c217d954SCole Faust#define vload_partial_1_9 NO_LOAD
1782*c217d954SCole Faust#define vload_partial_1_10 NO_LOAD
1783*c217d954SCole Faust#define vload_partial_1_11 NO_LOAD
1784*c217d954SCole Faust#define vload_partial_1_12 NO_LOAD
1785*c217d954SCole Faust#define vload_partial_1_13 NO_LOAD
1786*c217d954SCole Faust#define vload_partial_1_14 NO_LOAD
1787*c217d954SCole Faust#define vload_partial_1_15 NO_LOAD
1788*c217d954SCole Faust#define vload_partial_1_16 NO_LOAD
1789*c217d954SCole Faust
1790*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
1791*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
1792*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
1793*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
1794*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
1795*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
1796*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
1797*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
1798*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
1799*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
1800*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
1801*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
1802*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
1803*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
1804*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
1805*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
1806*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
1807*c217d954SCole Faust
1808*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
1809*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
1810*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
1811*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
1812*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
1813*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
1814*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
1815*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
1816*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
1817*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
1818*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
1819*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
1820*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
1821*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
1822*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
1823*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
1824*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
1825*c217d954SCole Faust
/* Width-4 row of the partial-load table: vload_partial_4_<n> forwards to
 * vload_partial_<n> for n = 1..4 and to NO_LOAD for n = 0 and n = 5..16. */
#define vload_partial_4_0  NO_LOAD
#define vload_partial_4_1  vload_partial_1
#define vload_partial_4_2  vload_partial_2
#define vload_partial_4_3  vload_partial_3
#define vload_partial_4_4  vload_partial_4
#define vload_partial_4_5  NO_LOAD
#define vload_partial_4_6  NO_LOAD
#define vload_partial_4_7  NO_LOAD
#define vload_partial_4_8  NO_LOAD
#define vload_partial_4_9  NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD
1843*c217d954SCole Faust
/* Width-8 row of the partial-load table: vload_partial_8_<n> forwards to
 * vload_partial_<n> for n = 1..8 and to NO_LOAD for n = 0 and n = 9..16. */
#define vload_partial_8_0  NO_LOAD
#define vload_partial_8_1  vload_partial_1
#define vload_partial_8_2  vload_partial_2
#define vload_partial_8_3  vload_partial_3
#define vload_partial_8_4  vload_partial_4
#define vload_partial_8_5  vload_partial_5
#define vload_partial_8_6  vload_partial_6
#define vload_partial_8_7  vload_partial_7
#define vload_partial_8_8  vload_partial_8
#define vload_partial_8_9  NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD
1861*c217d954SCole Faust
/* Width-16 row of the partial-load table: vload_partial_16_<n> forwards to
 * vload_partial_<n> for every n = 1..16; only n = 0 resolves to NO_LOAD. */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
1879*c217d954SCole Faust
1880*c217d954SCole Faust
/* vload_partial_<n>(DATA, OFFSET, PTR): assign the first <n> components of DATA from PTR.
 *  - n = 1, 2, 3, 4, 8, 16 : a single vload<n>(OFFSET, PTR) into the matching swizzle of DATA.
 *  - n = 5..7              : a 4-wide load, then a 1/2/3-wide load taken from PTR + 4.
 *  - n = 9..15             : an 8-wide load, then a 1..7-wide load taken from PTR + 8.
 * Every expansion already ends with ';'. Multi-statement expansions are not wrapped in a
 * block. */
#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);
#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);
#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);
#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR) vload_partial_4(DATA.s0123, OFFSET, PTR); DATA.s4 = vload1(OFFSET, PTR + 4);
#define vload_partial_6(DATA, OFFSET, PTR) vload_partial_4(DATA.s0123, OFFSET, PTR); vload_partial_2(DATA.s45, OFFSET, PTR + 4);
#define vload_partial_7(DATA, OFFSET, PTR) vload_partial_4(DATA.s0123, OFFSET, PTR); vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR) vload_partial_8(DATA.s01234567, OFFSET, PTR); DATA.s8 = vload1(OFFSET, PTR + 8);
#define vload_partial_10(DATA, OFFSET, PTR) vload_partial_8(DATA.s01234567, OFFSET, PTR); vload_partial_2(DATA.s89, OFFSET, PTR + 8);
#define vload_partial_11(DATA, OFFSET, PTR) vload_partial_8(DATA.s01234567, OFFSET, PTR); vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
#define vload_partial_12(DATA, OFFSET, PTR) vload_partial_8(DATA.s01234567, OFFSET, PTR); vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* For 13..15 the upper half is handed over as the full 8-wide swizzle s89ABCDEF and the
 * nested 5/6/7-wide macro picks its own components out of that half. */
#define vload_partial_13(DATA, OFFSET, PTR) vload_partial_8(DATA.s01234567, OFFSET, PTR); vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
#define vload_partial_14(DATA, OFFSET, PTR) vload_partial_8(DATA.s01234567, OFFSET, PTR); vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
#define vload_partial_15(DATA, OFFSET, PTR) vload_partial_8(DATA.s01234567, OFFSET, PTR); vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR);
1938*c217d954SCole Faust
1939*c217d954SCole Faust
1940*c217d954SCole Faust
/* PIXEL_UNIT<vec_size>: the constant looked up for a vector size of 4, 8 or 16
 * (1, 2 and 4 respectively, i.e. vec_size / 4). */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* Two-level paste so that a macro passed as vec_size is expanded before it is glued
 * onto PIXEL_UNIT. Only 4, 8 and 16 have a matching PIXEL_UNIT<n> definition here. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size)     CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1948*c217d954SCole Faust
1949*c217d954SCole Faust
/* read_image2d_<type>x<k>(img, x_coord, y_coord): issue k image reads at
 * (x_coord, y_coord), (x_coord + 1, y_coord), ... and pack the results into one
 * 4*k-wide vector. Each expansion carries its own trailing ';'. */
#define read_image2d_floatx1(img, x_coord, y_coord) \
    (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord)          \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)),     \
             read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord)           \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)),     \
              read_imagef(img, (int2)(x_coord + 1, y_coord)), \
              read_imagef(img, (int2)(x_coord + 2, y_coord)), \
              read_imagef(img, (int2)(x_coord + 3, y_coord)));

/* Half-precision variants, only defined when both FP16 switches are set. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) \
    (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord)          \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)),     \
            read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord)           \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)),     \
             read_imageh(img, (int2)(x_coord + 1, y_coord)), \
             read_imageh(img, (int2)(x_coord + 2, y_coord)), \
             read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif
1959*c217d954SCole Faust
/* write_image2d_<type>x<k>(img, x_coord, y_coord, values): issue k image writes at
 * (x_coord, y_coord), (x_coord + 1, y_coord), ..., each taking the next group of four
 * components of values. The writes are joined with the comma operator inside one
 * parenthesised expression, and each expansion carries its own trailing ';'. */
#define write_image2d_floatx1(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values)  \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values)      \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123),     \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
     write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
     write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

/* Half-precision variants, only defined when both FP16 switches are set. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values)   \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values)       \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123),     \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
     write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
     write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif
1969*c217d954SCole Faust
1970*c217d954SCole Faust
/* READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) resolves to
 * read_image2d_<data_type>x<n0>(img, x_coord, y_coord). The _STR layer performs the
 * paste; the outer macro exists so macro arguments are expanded first. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) \
    read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) \
    READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) resolves to
 * write_image2d_<data_type>x<n0>(img, x_coord, y_coord, values) in the same way. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
    write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
    WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1977*c217d954SCole Faust
/* VSTORE(size) names the store routine vstore<size>; the extra level lets a macro
 * passed as size expand before the paste. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size)     VSTORE_STR(size)
1980*c217d954SCole Faust
/* "<type>1" spellings collapse to the scalar type, so code that pastes a vector width
 * onto a type name also works for a width of 1. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double
1992*c217d954SCole Faust
/* Scalar stand-ins for the vloadN / vstoreN family (width 1).
 *   vload1(OFFSET, PTR)        reads the element at PTR + OFFSET.
 *   vstore1(DATA, OFFSET, PTR) writes DATA to the element at PTR + OFFSET.
 * Every argument and the whole expansion are parenthesised so that arguments containing
 * lower-precedence operators (shifts, ?:, ...) and surrounding operators cannot regroup
 * the expression. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
1995*c217d954SCole Faust
1996*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size) names the table entry
 * vstore_partial_<size>_<store_size>; the extra level lets macro arguments expand
 * before the paste. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size)     VSTORE_PARTIAL_STR(size, store_size)

/* Table filler: accepts the same three arguments as a store and expands to an empty block. */
#define NO_STORE(data, offs, ptr) { }
2003*c217d954SCole Faust
2004*c217d954SCole Faust
/* Width-1 row of the partial-store table: only vstore_partial_1_1 stores, and it goes
 * straight to vstore1 (not to vstore_partial_1); every other entry is NO_STORE. */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
2022*c217d954SCole Faust
/* Width-2 row of the partial-store table: vstore_partial_2_<n> forwards to
 * vstore_partial_<n> for n = 1..2 and to NO_STORE for n = 0 and n = 3..16. */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
2040*c217d954SCole Faust
/* Width-3 row of the partial-store table: vstore_partial_3_<n> forwards to
 * vstore_partial_<n> for n = 1..3 and to NO_STORE for n = 0 and n = 4..16. */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
2058*c217d954SCole Faust
/* Width-4 row of the partial-store table: vstore_partial_4_<n> forwards to
 * vstore_partial_<n> for n = 1..4 and to NO_STORE for n = 0 and n = 5..16. */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
2076*c217d954SCole Faust
/* Width-8 row of the partial-store table: vstore_partial_8_<n> forwards to
 * vstore_partial_<n> for n = 1..8 and to NO_STORE for n = 0 and n = 9..16. */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
2094*c217d954SCole Faust
/* Width-16 row of the partial-store table: vstore_partial_16_<n> forwards to
 * vstore_partial_<n> for every n = 1..16; only n = 0 resolves to NO_STORE. */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
2112*c217d954SCole Faust
2113*c217d954SCole Faust
/* vstore_partial_<n>(DATA, OFFSET, PTR): store the first <n> components of DATA at PTR.
 *  - n = 1, 2, 3, 4, 8, 16 : a single vstore<n> of the matching swizzle of DATA.
 *  - n = 5..7              : a 4-wide store, then a 1/2/3-wide store at PTR + 4.
 *  - n = 9..15             : an 8-wide store, then a 1..7-wide store at PTR + 8.
 * Every expansion already ends with ';'. Multi-statement expansions are not wrapped in a
 * block. Hex component letters are written in upper case throughout. */
#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR);
#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR);
#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR);
#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR) vstore_partial_4(DATA.s0123, OFFSET, PTR); vstore1(DATA.s4, OFFSET, PTR + 4);
#define vstore_partial_6(DATA, OFFSET, PTR) vstore_partial_4(DATA.s0123, OFFSET, PTR); vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
#define vstore_partial_7(DATA, OFFSET, PTR) vstore_partial_4(DATA.s0123, OFFSET, PTR); vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR) vstore_partial_8(DATA.s01234567, OFFSET, PTR); vstore1(DATA.s8, OFFSET, PTR + 8);
#define vstore_partial_10(DATA, OFFSET, PTR) vstore_partial_8(DATA.s01234567, OFFSET, PTR); vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
#define vstore_partial_11(DATA, OFFSET, PTR) vstore_partial_8(DATA.s01234567, OFFSET, PTR); vstore_partial_3(DATA.s89A, OFFSET, PTR + 8);
#define vstore_partial_12(DATA, OFFSET, PTR) vstore_partial_8(DATA.s01234567, OFFSET, PTR); vstore_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* For 13..15 the upper half is handed over as the full 8-wide swizzle s89ABCDEF and the
 * nested 5/6/7-wide macro picks its own components out of that half. */
#define vstore_partial_13(DATA, OFFSET, PTR) vstore_partial_8(DATA.s01234567, OFFSET, PTR); vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
#define vstore_partial_14(DATA, OFFSET, PTR) vstore_partial_8(DATA.s01234567, OFFSET, PTR); vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
#define vstore_partial_15(DATA, OFFSET, PTR) vstore_partial_8(DATA.s01234567, OFFSET, PTR); vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR);
2171*c217d954SCole Faust
2172*c217d954SCole Faust
2173*c217d954SCole Faust
2174*c217d954SCole Faust
2175*c217d954SCole Faust
/* OpenCL C has no saturating conversion to a floating-point destination, so every
 * "_sat" spelling for float and half forwards to the plain conversion of the same type
 * and width. This keeps CONVERT_SAT-style token pasting usable with float/half.
 * convert_half_sat now forwards to convert_half (it used to name convert_float), in
 * line with convert_half1_sat and the wider half entries. */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16
2190*c217d954SCole Faust
/* "convert_<type>1" spellings collapse to the scalar conversion, so a conversion name
 * built by pasting a width of 1 onto a type still resolves. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double
2202*c217d954SCole Faust
/* "convert_<type>1_sat" spellings collapse to the scalar saturating conversion.
 * The convert_uchar{2,3,4,8,16}_sat lines define each name as itself, so they expand to
 * the same identifier and are kept only for completeness.
 * double is a floating-point destination, for which OpenCL C provides no "_sat"
 * conversion; convert_double1_sat therefore forwards to plain convert_double, the same
 * way the float and half "_sat" aliases do. */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
#define convert_double1_sat convert_double
2217*c217d954SCole Faust
/* Token-pasting helpers. Each public macro goes through a *_STR twin so that macro
 * arguments are expanded before being pasted.
 *   VEC_DATA_TYPE(type, size)         -> <type><size>
 *   CONVERT(x, type)                  -> (convert_<type>((x)))
 *   CONVERT_SAT(x, type)              -> (convert_<type>_sat((x)))
 *   CONVERT_SAT_ROUND(x, type, round) -> (convert_<type>_sat_<round>((x))) */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size)     VEC_DATA_TYPE_STR(type, size)

#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type)     CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type)     CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round)     CONVERT_SAT_ROUND_STR(x, type, round)
2229*c217d954SCole Faust
/* select_vec_dt_<type>(size): every integer type maps to itself at the given width,
 * half maps to short<size> and float maps to int<size>.
 * NOTE(review): presumably the integer types used as select() masks - confirm with callers. */
#define select_vec_dt_uchar(size)  uchar##size
#define select_vec_dt_char(size)   char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size)  short##size
#define select_vec_dt_half(size)   short##size
#define select_vec_dt_uint(size)   uint##size
#define select_vec_dt_int(size)    int##size
#define select_vec_dt_float(size)  int##size
#define select_vec_dt_ulong(size)  ulong##size
#define select_vec_dt_long(size)   long##size

/* SELECT_VEC_DATA_TYPE(type, size) picks the entry for type at the given width;
 * SELECT_DATA_TYPE(type) is the same lookup with a width of 1. */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size)     SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type)               SELECT_VEC_DATA_TYPE_STR(type, 1)
2244*c217d954SCole Faust
/* signed_int_vec_dt_<type>(size): the signed integer type listed for each element
 * type at the given width. Unsigned types map to their signed counterpart, signed
 * types to themselves, half to short<size> and float to int<size>. */
#define signed_int_vec_dt_uchar(size)  char##size
#define signed_int_vec_dt_char(size)   char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size)  short##size
#define signed_int_vec_dt_half(size)   short##size
#define signed_int_vec_dt_uint(size)   int##size
#define signed_int_vec_dt_int(size)    int##size
#define signed_int_vec_dt_float(size)  int##size
#define signed_int_vec_dt_ulong(size)  long##size
#define signed_int_vec_dt_long(size)   long##size

/* SIGNED_INT_VEC_DATA_TYPE(type, size) picks the entry for type at the given width;
 * SIGNED_INT_DATA_TYPE(type) is the same lookup with a width of 1. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size)     SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type)               SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259*c217d954SCole Faust
/* Flat building blocks for the sum reduction. They are deliberately NOT parenthesised
 * as a whole: composing them yields one left-to-right chain of '+', so the order in
 * which the components are added is the same as it has always been. Not meant to be
 * used directly. */
#define sum_reduce_raw_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_raw_3(x) sum_reduce_raw_2((x).s01) + ((x).s2)
#define sum_reduce_raw_4(x) sum_reduce_raw_2((x).s01) + sum_reduce_raw_2((x).s23)
#define sum_reduce_raw_8(x) sum_reduce_raw_4((x).s0123) + sum_reduce_raw_4((x).s4567)
#define sum_reduce_raw_16(x) sum_reduce_raw_8((x).s01234567) + sum_reduce_raw_8((x).s89ABCDEF)

/* sum_reduce_<n>(x): sum of the n components of x. The whole expansion is wrapped in
 * parentheses so it can sit next to operators that bind tighter than '+', for
 * example SUM_REDUCE(v, 4) * scale. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (sum_reduce_raw_2(x))
#define sum_reduce_3(x) (sum_reduce_raw_3(x))
#define sum_reduce_4(x) (sum_reduce_raw_4(x))
#define sum_reduce_8(x) (sum_reduce_raw_8(x))
#define sum_reduce_16(x) (sum_reduce_raw_16(x))

/* SUM_REDUCE(x, size) selects sum_reduce_<size>; the _STR layer lets a macro passed as
 * size expand before the paste. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2269*c217d954SCole Faust
/* Flat building blocks for the product reduction. They are deliberately NOT
 * parenthesised as a whole: composing them yields one left-to-right chain of '*', so
 * the order in which the components are multiplied is the same as it has always been.
 * Not meant to be used directly. */
#define prod_reduce_raw_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_raw_3(x) prod_reduce_raw_2((x).s01) * ((x).s2)
#define prod_reduce_raw_4(x) prod_reduce_raw_2((x).s01) * prod_reduce_raw_2((x).s23)
#define prod_reduce_raw_8(x) prod_reduce_raw_4((x).s0123) * prod_reduce_raw_4((x).s4567)
#define prod_reduce_raw_16(x) prod_reduce_raw_8((x).s01234567) * prod_reduce_raw_8((x).s89ABCDEF)

/* prod_reduce_<n>(x): product of the n components of x. The whole expansion is wrapped
 * in parentheses so it is safe on the right of '/', '%' or '-', for example
 * total / PROD_REDUCE(v, 2). */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (prod_reduce_raw_2(x))
#define prod_reduce_3(x) (prod_reduce_raw_3(x))
#define prod_reduce_4(x) (prod_reduce_raw_4(x))
#define prod_reduce_8(x) (prod_reduce_raw_8(x))
#define prod_reduce_16(x) (prod_reduce_raw_16(x))

/* PROD_REDUCE(x, size) selects prod_reduce_<size>; the _STR layer lets a macro passed
 * as size expand before the paste. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2279*c217d954SCole Faust
/* max_reduce_<n>(vec): fold the n components of vec with max(), halving the vector at
 * each level (the 3-wide case folds the first two components, then the third). */
#define max_reduce_1(vec)  (vec)
#define max_reduce_2(vec)  max(((vec).s0), ((vec).s1))
#define max_reduce_3(vec)  max(max_reduce_2((vec).s01), ((vec).s2))
#define max_reduce_4(vec)  max(max_reduce_2((vec).s01), max_reduce_2((vec).s23))
#define max_reduce_8(vec)  max(max_reduce_4((vec).s0123), max_reduce_4((vec).s4567))
#define max_reduce_16(vec) max(max_reduce_8((vec).s01234567), max_reduce_8((vec).s89ABCDEF))

/* MAX_REDUCE(x, size) selects max_reduce_<size>; the _STR layer lets a macro passed as
 * size expand before the paste. */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size)     MAX_REDUCE_STR(x, size)
2289*c217d954SCole Faust
/* Parameter list for a 1D buffer called name: base pointer, x stride, x step and the
 * byte offset of the first element. There is no trailing comma. */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr,  \
    uint name##_stride_x,        \
    uint name##_step_x,          \
    uint name##_offset_first_element_in_bytes
2295*c217d954SCole Faust
/* Parameter list for a 2D buffer called name: base pointer, stride/step pairs for x
 * and y, and the byte offset of the first element. There is no trailing comma. */
#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x,       \
    uint name##_step_x,         \
    uint name##_stride_y,       \
    uint name##_step_y,         \
    uint name##_offset_first_element_in_bytes
2303*c217d954SCole Faust
/* Parameter list for a 3D buffer called name: base pointer, stride/step pairs for x,
 * y and z, and the byte offset of the first element. There is no trailing comma. */
#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_offset_first_element_in_bytes
2313*c217d954SCole Faust
/* Parameter list for a 4D buffer called name: base pointer, stride/step pairs for x,
 * y, z and w, and the byte offset of the first element. There is no trailing comma. */
#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_offset_first_element_in_bytes
2325*c217d954SCole Faust
/* Parameter list for a 5D buffer called name: base pointer, stride/step pairs for x,
 * y, z, w and v, and the byte offset of the first element. There is no trailing comma. */
#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr,    \
    uint name##_stride_x,          \
    uint name##_step_x,            \
    uint name##_stride_y,          \
    uint name##_step_y,            \
    uint name##_stride_z,          \
    uint name##_step_z,            \
    uint name##_stride_w,          \
    uint name##_step_w,            \
    uint name##_stride_v,          \
    uint name##_step_v,            \
    uint name##_offset_first_element_in_bytes
2339*c217d954SCole Faust
/* Forward the name##_* arguments of a 1D buffer to update_vector_workitem_ptr. */
#define CONVERT_TO_VECTOR_STRUCT(name)                               \
    update_vector_workitem_ptr(name##_ptr,                           \
                               name##_offset_first_element_in_bytes, \
                               name##_stride_x,                      \
                               name##_step_x)

/* Same call, but with 0 passed in place of name##_step_x. */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)                       \
    update_vector_workitem_ptr(name##_ptr,                           \
                               name##_offset_first_element_in_bytes, \
                               name##_stride_x,                      \
                               0)
2345*c217d954SCole Faust
/** Calls update_image_workitem_ptr() with the pointer, first-element offset and
 *  the x/y stride and step values of the tensor argument called @p name.
 */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, \
                              name##_offset_first_element_in_bytes, \
                              name##_stride_x, name##_step_x, \
                              name##_stride_y, name##_step_y)

/** Like CONVERT_TO_IMAGE_STRUCT, except that 0 is passed for both steps. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, \
                              name##_offset_first_element_in_bytes, \
                              name##_stride_x, 0, \
                              name##_stride_y, 0)
2351*c217d954SCole Faust
/** Calls update_image_from_tensor3D_workitem_ptr() with the pointer,
 *  first-element offset and the x/y/z stride and step values of the tensor
 *  argument called @p name.
 *
 *  This macro used to be defined twice with identical replacement text; only
 *  one definition is kept.
 */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/** Variant of CONVERT_TENSOR3D_TO_IMAGE_STRUCT that passes 0 for the x and y
 *  steps. The z step (name##_step_z) is still forwarded unchanged.
 */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2360*c217d954SCole Faust
/** Calls update_tensor3D_workitem_ptr() with the pointer, first-element offset
 *  and the x/y/z stride and step values of the tensor argument called @p name.
 */
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, \
                                 name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x, \
                                 name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

/** Like CONVERT_TO_TENSOR3D_STRUCT, except that 0 is passed for all three steps. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, \
                                 name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0, \
                                 name##_stride_y, 0, \
                                 name##_stride_z, 0)
2367*c217d954SCole Faust
/** Calls update_tensor4D_workitem_ptr() with the pointer, first-element offset
 *  and the x/y/z/w stride and step values of the tensor argument called
 *  @p name, followed by @p mod_size.
 */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, \
                                 name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x, \
                                 name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, \
                                 name##_stride_w, name##_step_w, \
                                 mod_size)

/** Like CONVERT_TO_TENSOR4D_STRUCT, except that 0 is passed for all four steps.
 *  @p mod_size is still forwarded.
 */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, \
                                 name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0, \
                                 name##_stride_y, 0, \
                                 name##_stride_z, 0, \
                                 name##_stride_w, 0, \
                                 mod_size)
2374*c217d954SCole Faust
/** Calls tensor3D_ptr_no_update() with the pointer, first-element offset and
 *  the x/y/z stride and step values of the tensor argument called @p name.
 */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, \
                           name##_offset_first_element_in_bytes, \
                           name##_stride_x, name##_step_x, \
                           name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
2378*c217d954SCole Faust
2379*c217d954SCole Faust
/** Handle describing a 1D buffer. */
typedef struct Vector
{
    __global uchar *ptr;                /* data pointer into global memory */
    int offset_first_element_in_bytes;  /* offset of the first element, in bytes */
    int stride_x;                       /* multiplier applied to an x index when offsetting ptr */
} Vector;
2386*c217d954SCole Faust
2387*c217d954SCole Faust
/** Handle describing a 2D buffer. */
typedef struct Image
{
    __global uchar *ptr;                /* data pointer into global memory */
    int offset_first_element_in_bytes;  /* offset of the first element, in bytes */
    int stride_x;                       /* multiplier applied to an x index when offsetting ptr */
    int stride_y;                       /* multiplier applied to a y index when offsetting ptr */
} Image;
2395*c217d954SCole Faust
2396*c217d954SCole Faust
/** Handle describing a 3D buffer. */
typedef struct Tensor3D
{
    __global uchar *ptr;                /* data pointer into global memory */
    int offset_first_element_in_bytes;  /* offset of the first element, in bytes */
    int stride_x;                       /* multiplier applied to an x index when offsetting ptr */
    int stride_y;                       /* multiplier applied to a y index when offsetting ptr */
    int stride_z;                       /* multiplier applied to a z index when offsetting ptr */
} Tensor3D;
2405*c217d954SCole Faust
2406*c217d954SCole Faust
/** Handle describing a 4D buffer. */
typedef struct Tensor4D
{
    __global uchar *ptr;                /* data pointer into global memory */
    int offset_first_element_in_bytes;  /* offset of the first element, in bytes */
    int stride_x;                       /* multiplier applied to an x index when offsetting ptr */
    int stride_y;                       /* multiplier applied to a y index when offsetting ptr */
    int stride_z;                       /* multiplier applied to a z index when offsetting ptr */
    int stride_w;                       /* multiplier applied to a w index when offsetting ptr */
} Tensor4D;
2416*c217d954SCole Faust
2417*c217d954SCole Faust
/** Fills a Vector and advances its pointer for the calling work-item.
 *
 * @param[in] ptr                           Base data pointer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in Vector::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 *
 * @return A Vector whose ptr is @p ptr advanced by the stored first-element
 *         offset plus get_global_id(0) * step_x.
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;

    /* The offset is deliberately read back from the (int) struct field. */
    result.ptr += result.offset_first_element_in_bytes + get_global_id(0) * step_x;

    return result;
}
2429*c217d954SCole Faust
2430*c217d954SCole Faust
/** Fills an Image and advances its pointer for the calling work-item.
 *
 * @param[in] ptr                           Base data pointer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in Image::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Value stored in Image::stride_y
 * @param[in] step_y                        Amount multiplied by get_global_id(1)
 *
 * @return An Image whose ptr is @p ptr advanced by the stored first-element
 *         offset plus get_global_id(0) * step_x + get_global_id(1) * step_y.
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;

    /* The offset is deliberately read back from the (int) struct field. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y;

    return result;
}
2443*c217d954SCole Faust
2444*c217d954SCole Faust
/** Fills an Image from 3D tensor parameters and advances its pointer for the
 *  calling work-item along all three global-id dimensions.
 *
 * @param[in] ptr                           Base data pointer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in Image::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Value stored in Image::stride_y
 * @param[in] step_y                        Amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Not stored and not otherwise used here
 * @param[in] step_z                        Amount multiplied by get_global_id(2)
 *
 * @return An Image whose ptr is @p ptr advanced by the stored first-element
 *         offset plus the three id * step products.
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;

    /* The offset is deliberately read back from the (int) struct field. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y
                  + get_global_id(2) * step_z;

    return result;
}
2457*c217d954SCole Faust
2458*c217d954SCole Faust
/** Fills a Tensor3D and advances its pointer for the calling work-item.
 *
 * @param[in] ptr                           Base data pointer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in Tensor3D::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Value stored in Tensor3D::stride_y
 * @param[in] step_y                        Amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Value stored in Tensor3D::stride_z
 * @param[in] step_z                        Amount multiplied by get_global_id(2)
 *
 * @return A Tensor3D whose ptr is @p ptr advanced by the stored first-element
 *         offset plus the three id * step products.
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;

    /* The offset is deliberately read back from the (int) struct field. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y
                  + get_global_id(2) * step_z;

    return result;
}
2472*c217d954SCole Faust
2473*c217d954SCole Faust
/** Fills a Tensor3D without moving its pointer.
 *
 * The returned ptr equals @p ptr; neither the first-element offset nor any
 * work-item id is applied. The three step arguments are accepted but unused.
 *
 * @param[in] ptr                           Base data pointer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes (stored only)
 * @param[in] stride_x                      Value stored in Tensor3D::stride_x
 * @param[in] step_x                        Unused
 * @param[in] stride_y                      Value stored in Tensor3D::stride_y
 * @param[in] step_y                        Unused
 * @param[in] stride_z                      Value stored in Tensor3D::stride_z
 * @param[in] step_z                        Unused
 *
 * @return The populated Tensor3D.
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;
    return result;
}
2486*c217d954SCole Faust
/** Fills a Tensor4D and advances its pointer for the calling work-item.
 *
 * get_global_id(2) is split with @p mod_size: the remainder is multiplied by
 * @p step_z and the quotient by @p step_w.
 *
 * @param[in] ptr                           Base data pointer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in Tensor4D::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Value stored in Tensor4D::stride_y
 * @param[in] step_y                        Amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Value stored in Tensor4D::stride_z
 * @param[in] step_z                        Amount multiplied by get_global_id(2) % mod_size
 * @param[in] stride_w                      Value stored in Tensor4D::stride_w
 * @param[in] step_w                        Amount multiplied by get_global_id(2) / mod_size
 * @param[in] mod_size                      Divisor used to split get_global_id(2)
 *
 * @return The populated Tensor4D with its ptr advanced as described.
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D result;
    result.ptr                           = ptr;
    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
    result.stride_x                      = stride_x;
    result.stride_y                      = stride_y;
    result.stride_z                      = stride_z;
    result.stride_w                      = stride_w;

    const size_t gid_zw = get_global_id(2);

    /* The offset is deliberately read back from the (int) struct field. */
    result.ptr += result.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y
                  + (gid_zw % mod_size) * step_z
                  + (gid_zw / mod_size) * step_w;

    return result;
}
2504*c217d954SCole Faust
2505*c217d954SCole Faust
/** Returns the address @p x strides away from the Vector's pointer.
 *
 * @param[in] vec Vector to address into
 * @param[in] x   Index multiplied by vec->stride_x
 *
 * @return vec->ptr + x * vec->stride_x
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    const int delta_x = x * vec->stride_x;
    return vec->ptr + delta_x;
}
2510*c217d954SCole Faust
2511*c217d954SCole Faust
/** Returns the address of position (x, y) relative to the Image's pointer.
 *
 * @param[in] img Image to address into
 * @param[in] x   Index multiplied by img->stride_x
 * @param[in] y   Index multiplied by img->stride_y
 *
 * @return img->ptr + x * img->stride_x + y * img->stride_y
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    /* Each contribution is added to the pointer in turn, x first. */
    __global uchar *cursor = img->ptr + x * img->stride_x;
    return cursor + y * img->stride_y;
}
2516*c217d954SCole Faust
2517*c217d954SCole Faust
/** Returns the address of position (x, y, z) relative to the tensor's pointer.
 *
 * @param[in] tensor Tensor3D to address into
 * @param[in] x      Index multiplied by tensor->stride_x
 * @param[in] y      Index multiplied by tensor->stride_y
 * @param[in] z      Index multiplied by tensor->stride_z
 *
 * @return tensor->ptr + x * stride_x + y * stride_y + z * stride_z
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    /* Each contribution is added to the pointer in turn: x, then y, then z. */
    __global const uchar *cursor = tensor->ptr + x * tensor->stride_x;
    cursor                       = cursor + y * tensor->stride_y;
    return cursor + z * tensor->stride_z;
}
2522*c217d954SCole Faust
2523*c217d954SCole Faust
/** Returns the address of position (x, y, z, w) relative to the tensor's pointer.
 *
 * @param[in] tensor Tensor4D to address into
 * @param[in] x      Index multiplied by tensor->stride_x
 * @param[in] y      Index multiplied by tensor->stride_y
 * @param[in] z      Index multiplied by tensor->stride_z
 * @param[in] w      Index multiplied by tensor->stride_w
 *
 * @return tensor->ptr + x * stride_x + y * stride_y + z * stride_z + w * stride_w
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    /* Each contribution is added to the pointer in turn: x, y, z, then w. */
    __global const uchar *cursor = tensor->ptr + x * tensor->stride_x;
    cursor                       = cursor + y * tensor->stride_y;
    cursor                       = cursor + z * tensor->stride_z;
    return cursor + w * tensor->stride_w;
}
2528*c217d954SCole Faust
2529*c217d954SCole Faust
/** Converts a linear index into (x, y, z) coordinates and returns the matching
 *  address inside a Tensor3D.
 *
 * The index is decomposed as z = index / (width * height), then
 * y = remainder / width and x = remainder % width. Unlike the *_offset helpers
 * above, the tensor's offset_first_element_in_bytes is added to the result.
 *
 * @param[in] tensor Tensor3D to address into
 * @param[in] width  Number of elements along x used for the decomposition
 * @param[in] height Number of elements along y used for the decomposition
 * @param[in] depth  Unused by this function
 * @param[in] index  Linear index to convert
 *
 * @return Pointer computed from the tensor's ptr, strides and first-element offset.
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_size = width * height;

    const uint z        = index / plane_size;
    const uint in_plane = index % plane_size;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}
2546*c217d954SCole Faust
2547*c217d954SCole Faust#endif
2548*c217d954SCole Faust
2549*c217d954SCole Faust
/** Pastes @p offset and @p n0 into the macro name scalar_access_<offset>_<n0>
 *  and invokes it on @p x. The arguments are pasted exactly as written.
 */
#define SCALAR_ACCESS_STR(offset, n0, x) \
    scalar_access_##offset##_##n0(x)

/** Extra level of indirection so that macro arguments are expanded before
 *  SCALAR_ACCESS_STR pastes them.
 */
#define SCALAR_ACCESS(offset, n0, x) \
    SCALAR_ACCESS_STR(offset, n0, x)
2552*c217d954SCole Faust
2553*c217d954SCole Faust
/* scalar_access_<offset>_<n0>(x) selects n0 consecutive components of the
 * vector x, starting at component <offset>, using .sN component notation.
 */

/* Starting at component 0 */
#define scalar_access_0_1(x)  ((x).s0)
#define scalar_access_0_2(x)  ((x).s01)
#define scalar_access_0_3(x)  ((x).s012)
#define scalar_access_0_4(x)  ((x).s0123)
#define scalar_access_0_8(x)  ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)

/* Starting at component 1 */
#define scalar_access_1_1(x)  ((x).s1)
#define scalar_access_1_2(x)  ((x).s12)
#define scalar_access_1_3(x)  ((x).s123)
#define scalar_access_1_4(x)  ((x).s1234)
#define scalar_access_1_8(x)  ((x).s12345678)

/* Starting at component 2 */
#define scalar_access_2_1(x)  ((x).s2)
#define scalar_access_2_2(x)  ((x).s23)
#define scalar_access_2_3(x)  ((x).s234)
#define scalar_access_2_4(x)  ((x).s2345)
#define scalar_access_2_8(x)  ((x).s23456789)

/* Starting at component 3 */
#define scalar_access_3_1(x)  ((x).s3)
#define scalar_access_3_2(x)  ((x).s34)
#define scalar_access_3_3(x)  ((x).s345)
#define scalar_access_3_4(x)  ((x).s3456)
#define scalar_access_3_8(x)  ((x).s3456789A)

/* Starting at component 4 */
#define scalar_access_4_1(x)  ((x).s4)
#define scalar_access_4_2(x)  ((x).s45)
#define scalar_access_4_3(x)  ((x).s456)
#define scalar_access_4_4(x)  ((x).s4567)
#define scalar_access_4_8(x)  ((x).s456789AB)

/* Starting at component 8 */
#define scalar_access_8_1(x)  ((x).s8)
#define scalar_access_8_2(x)  ((x).s89)
#define scalar_access_8_3(x)  ((x).s89A)
#define scalar_access_8_4(x)  ((x).s89AB)
#define scalar_access_8_8(x)  ((x).s89ABCDEF)

/* Starting at component 12 */
#define scalar_access_12_1(x) ((x).sC)
#define scalar_access_12_2(x) ((x).sCD)
#define scalar_access_12_3(x) ((x).sCDE)
#define scalar_access_12_4(x) ((x).sCDEF)

/* NOTE(review): the "16" entry selects .sF, i.e. component 15, which breaks the
 * offset == component pattern of the entries above. Confirm this is intended.
 */
#define scalar_access_16_1(x) ((x).sF)
2604*c217d954SCole Faust
2605*c217d954SCole Faust
/* LOAD_TENSOR_ROW_<M>(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
 *
 * For every row r in [0, M): loads N0 values with VLOAD(N0) from
 * PTR + r * STRIDE_Y + Z##<r> and assigns them to the components of
 * BASENAME##<r> selected by SCALAR_ACCESS(COL_OFFSET, N0, ...).
 * Row suffixes are the hexadecimal digits 0-9, A-F. Each macro first expands
 * the macro for M - 1 rows and then adds its own row. The M == 0 form expands
 * to an empty statement expression.
 */
#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    ({})

#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2671*c217d954SCole Faust
2672*c217d954SCole Faust
2673*c217d954SCole Faust
/** Pastes @p M0 onto LOAD_TENSOR_ROW_ and forwards the remaining arguments to
 *  the resulting macro. @p M0 is pasted exactly as written.
 */
#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)

/** Extra level of indirection so that macro arguments are expanded before
 *  LOAD_TENSOR_STR pastes M0.
 */
#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2676*c217d954SCole Faust
2677*c217d954SCole Faust
2678*c217d954SCole Faust
/* LOAD_TENSOR_M0X<N0>(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)
 *
 * Loads M0 rows of <N0> columns into the variables a0, a1, ... by issuing one
 * or more LOAD_TENSOR calls. Widths 1, 2, 3, 4 and 8 use a single call at
 * column offset 0. Widths 5, 6 and 7 use a 4-wide call at column offset 0
 * followed by a 1-, 2- or 3-wide call at column offset 4, whose pointer is
 * advanced by 4 * sizeof(DATA_TYPE). Width 0 expands to an empty statement
 * expression.
 */
#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({})

/* Single-call widths */
#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

/* 4 + remainder */
#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

/* Single-call width */
#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2708*c217d954SCole Faust
/** Loads M0 rows of 9 columns into a0, a1, ...: an 8-wide LOAD_TENSOR at column
 *  offset 0, then a 1-wide LOAD_TENSOR at column offset 8 whose pointer is
 *  advanced by 8 * sizeof(DATA_TYPE).
 *
 *  The first call previously read "input_ptr 0" with the comma missing, which
 *  handed LOAD_TENSOR seven arguments instead of the eight it requires.
 */
#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2712*c217d954SCole Faust
/* LOAD_TENSOR_M0X10 .. LOAD_TENSOR_M0X13: widths assembled from an 8-wide
 * LOAD_TENSOR at column offset 0 plus further calls for the remaining columns.
 * Each follow-up call advances input_ptr by <column> * sizeof(DATA_TYPE) and
 * passes the same <column> as the column offset.
 */

/* 8 + 2 */
#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

/* 8 + 3 */
#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

/* 8 + 4 */
#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

/* 8 + 4 + 1 */
#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2729*c217d954SCole Faust
/** Loads M0 rows of 14 columns into a0, a1, ...: an 8-wide LOAD_TENSOR at
 *  column offset 0, a 4-wide one at column offset 8 and a 2-wide one at column
 *  offset 12. Each follow-up call advances input_ptr by
 *  <column> * sizeof(DATA_TYPE).
 *
 *  The first call previously read "input_ptr 0" with the comma missing, which
 *  handed LOAD_TENSOR seven arguments instead of the eight it requires.
 */
#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2734*c217d954SCole Faust
/** Loads M0 rows of 15 columns into a0, a1, ...: an 8-wide LOAD_TENSOR at
 *  column offset 0, a 4-wide one at column offset 8 and a 3-wide one at column
 *  offset 12. Each follow-up call advances input_ptr by
 *  <column> * sizeof(DATA_TYPE).
 */
#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

/** Loads M0 rows of 16 columns with a single LOAD_TENSOR of width N0 at column offset 0. */
#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2742*c217d954SCole Faust
2743*c217d954SCole Faust
2744*c217d954SCole Faust
/** Pastes @p N0 onto LOAD_TENSOR_M0X and forwards all arguments to the
 *  resulting macro. @p N0 is pasted exactly as written.
 */
#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Extra level of indirection so that macro arguments are expanded before
 *  LOAD_TENSOR_M0XN0_STR pastes N0.
 */
#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2747*c217d954SCole Faust
2748*c217d954SCole Faust
/* LOAD_ROW_<M>(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
 *
 * For every row r in [0, M): declares a variable BASENAME##<r> of type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initialises it with VLOAD(N0) from
 * PTR + OFFSET + r * STRIDE_Y + Z##<r>. Row suffixes are the hexadecimal
 * digits 0-9, A-F. Each macro first expands the macro for M - 1 rows and then
 * adds its own row.
 */
#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));

#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2807*c217d954SCole Faust
2808*c217d954SCole Faust#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2809*c217d954SCole Faust    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2810*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2811*c217d954SCole Faust    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2812*c217d954SCole Faust
2813*c217d954SCole Faust#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2814*c217d954SCole Faust    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2815*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2816*c217d954SCole Faust    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2817*c217d954SCole Faust
2818*c217d954SCole Faust#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2819*c217d954SCole Faust    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2820*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2821*c217d954SCole Faust    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2822*c217d954SCole Faust
2823*c217d954SCole Faust#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2824*c217d954SCole Faust    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2825*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2826*c217d954SCole Faust    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2827*c217d954SCole Faust
2828*c217d954SCole Faust
2829*c217d954SCole Faust
2830*c217d954SCole Faust
/* LOAD_BLOCK(M0, ...) selects LOAD_ROW_<M0> by token pasting and forwards the remaining
 * arguments unchanged, producing M0 declared-and-loaded row vectors.
 *
 * Callers use LOAD_BLOCK; LOAD_BLOCK_STR is the helper that does the ## paste. The extra
 * level exists so that an M0 argument which is itself a macro is fully expanded before
 * it is concatenated onto LOAD_ROW_ (operands of ## are not macro-expanded).
 * The LOAD_ROW_n definitions in this file stop at n = 16.
 */
2831*c217d954SCole Faust#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2832*c217d954SCole Faust#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2833*c217d954SCole Faust
2834*c217d954SCole Faust
2835*c217d954SCole Faust
/* LOAD_ROW_PARTIAL_1 .. LOAD_ROW_PARTIAL_16: partial-width load of n consecutive rows.
 *
 * LOAD_ROW_PARTIAL_n expands LOAD_ROW_PARTIAL_(n-1) and then appends one call for row
 * k = n - 1:
 *     VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##k, 0, <address of row k>);
 * where the row address is (PTR + OFFSET + k * STRIDE_Y + Z##k) cast to a
 * __global DATA_TYPE pointer. Row suffixes are 0-9 and then A-F for rows 10..15.
 *
 * Unlike LOAD_ROW_n, nothing is declared here: BASENAME##k is handed to the
 * VLOAD_PARTIAL expansion as its first argument, so each BASENAME##k must already be
 * declared before this macro is expanded.
 *
 * VLOAD_PARTIAL is defined outside this section. NOTE(review): the argument names
 * suggest N0 is the width of the destination vector and LOAD_N0 the number of elements
 * actually read - confirm against the VLOAD_PARTIAL definition.
 */
2836*c217d954SCole Faust#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2837*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2838*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2839*c217d954SCole Faust
2840*c217d954SCole Faust#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2841*c217d954SCole Faust    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2842*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2843*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2844*c217d954SCole Faust
2845*c217d954SCole Faust#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2846*c217d954SCole Faust    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2847*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2848*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2849*c217d954SCole Faust
2850*c217d954SCole Faust#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2851*c217d954SCole Faust    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2852*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2853*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2854*c217d954SCole Faust
2855*c217d954SCole Faust#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2856*c217d954SCole Faust    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2857*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2858*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2859*c217d954SCole Faust
2860*c217d954SCole Faust#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2861*c217d954SCole Faust    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2862*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2863*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2864*c217d954SCole Faust
2865*c217d954SCole Faust#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2866*c217d954SCole Faust    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2867*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2868*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2869*c217d954SCole Faust
2870*c217d954SCole Faust#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2871*c217d954SCole Faust    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2872*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2873*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2874*c217d954SCole Faust
2875*c217d954SCole Faust#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2876*c217d954SCole Faust    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2877*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2878*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2879*c217d954SCole Faust
2880*c217d954SCole Faust#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2881*c217d954SCole Faust    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2882*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2883*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2884*c217d954SCole Faust
2885*c217d954SCole Faust#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2886*c217d954SCole Faust    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2887*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2888*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2889*c217d954SCole Faust
2890*c217d954SCole Faust#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2891*c217d954SCole Faust    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2892*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2893*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2894*c217d954SCole Faust
2895*c217d954SCole Faust#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2896*c217d954SCole Faust    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2897*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2898*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2899*c217d954SCole Faust
2900*c217d954SCole Faust#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2901*c217d954SCole Faust    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2902*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2903*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2904*c217d954SCole Faust
2905*c217d954SCole Faust#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2906*c217d954SCole Faust    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2907*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2908*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2909*c217d954SCole Faust
2910*c217d954SCole Faust#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2911*c217d954SCole Faust    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2912*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2913*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2914*c217d954SCole Faust
2915*c217d954SCole Faust
2916*c217d954SCole Faust
/* LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, ...) selects LOAD_ROW_PARTIAL_<LOAD_M0> by
 * token pasting, i.e. it issues LOAD_M0 row loads through VLOAD_PARTIAL(N0, LOAD_N0).
 *
 * Note the argument reordering: this macro takes (LOAD_M0, LOAD_N0, N0, ...) but the row
 * macros take (N0, LOAD_N0, ...). LOAD_BLOCK_PARTIAL_STR is the pasting helper; going
 * through it lets a LOAD_M0 argument that is itself a macro expand before the ## paste.
 * The BASENAME##k destination variables are not declared by this expansion.
 */
2917*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2918*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2919*c217d954SCole Faust
/* Load a block whose row count and/or row width may be reduced, chosen at run time.
 *
 * PARTIAL_COND_Y and PARTIAL_COND_X are boolean expressions evaluated where the macro is
 * expanded. The four branches call LOAD_BLOCK_PARTIAL with:
 *   neither set : M0 rows,               N0 elements per row
 *   only Y set  : PARTIAL_STORE_M0 rows, N0 elements per row
 *   only X set  : M0 rows,               PARTIAL_STORE_N0 elements per row
 *   both set    : PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements per row
 * The vector width passed as N0 is the same in every branch.
 *
 * M0, PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are pasted/forwarded into macro names, so
 * they must expand to literal values. The expansion is a bare if / else-if chain ending
 * in a closing brace (it is not wrapped in do-while), and it does not declare the
 * BASENAME##k destination variables.
 */
2920*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2921*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
2922*c217d954SCole Faust    {                                                                                                                                                            \
2923*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
2924*c217d954SCole Faust    }                                                                                                                                                            \
2925*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
2926*c217d954SCole Faust    {                                                                                                                                                            \
2927*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2928*c217d954SCole Faust    }                                                                                                                                                            \
2929*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
2930*c217d954SCole Faust    {                                                                                                                                                            \
2931*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2932*c217d954SCole Faust    }                                                                                                                                                            \
2933*c217d954SCole Faust    else                                                                                                                                                         \
2934*c217d954SCole Faust    {                                                                                                                                                            \
2935*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
2936*c217d954SCole Faust    }
2937*c217d954SCole Faust
/* Load M0 rows, choosing the per-row element count at run time.
 *
 * When PARTIAL_COND_X is false every row is loaded with N0 elements; otherwise every row
 * is loaded with PARTIAL_STORE_N0 elements. The row count is M0 in both branches and the
 * vector width passed as N0 does not change. The expansion is a bare if / else statement
 * and does not declare the BASENAME##k destination variables.
 */
2938*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
2939*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                                \
2940*c217d954SCole Faust    {                                                                                                                    \
2941*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2942*c217d954SCole Faust    }                                                                                                                    \
2943*c217d954SCole Faust    else                                                                                                                 \
2944*c217d954SCole Faust    {                                                                                                                    \
2945*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2946*c217d954SCole Faust    }
2947*c217d954SCole Faust
/* Load full-width rows, choosing the number of rows at run time.
 *
 * When PARTIAL_COND_Y is false M0 rows are loaded; otherwise only PARTIAL_STORE_M0 rows
 * are loaded. Each loaded row uses N0 elements in both branches. Rows that are not
 * loaded are left untouched by this macro. The expansion is a bare if / else statement
 * and does not declare the BASENAME##k destination variables.
 */
2948*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
2949*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                                \
2950*c217d954SCole Faust    {                                                                                                                    \
2951*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2952*c217d954SCole Faust    }                                                                                                                    \
2953*c217d954SCole Faust    else                                                                                                                 \
2954*c217d954SCole Faust    {                                                                                                                    \
2955*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2956*c217d954SCole Faust    }
2957*c217d954SCole Faust
2958*c217d954SCole Faust
/* LOAD_BLOCK_BOUNDARY_AWARE: one entry point, four build-time variants.
 *
 * The #if chain tests the build-time macros PARTIAL_STORE_M0 and PARTIAL_STORE_N0 and
 * defines exactly one variant. In a #if expression an identifier that is not defined as
 * a macro evaluates to 0, so leaving both undefined selects the first variant.
 * All variants share one parameter list so call sites do not change; inside the macro
 * the parameters named PARTIAL_STORE_M0 / PARTIAL_STORE_N0 are whatever the caller passes.
 *
 * Every variant other than the first starts with REPEAT_VAR_INIT_TO_CONST(M0, <vector
 * type>, BASENAME, 0), defined outside this section. NOTE(review): presumably that
 * declares BASENAME##0 .. BASENAME##(M0-1) and sets them to 0 so that rows or elements
 * skipped by a partial load hold zero - confirm against its definition.
 */
2959*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2960*c217d954SCole Faust
/* No partial dimension: plain LOAD_BLOCK. The four PARTIAL_* arguments are ignored. */
2961*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2962*c217d954SCole Faust    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2963*c217d954SCole Faust
2964*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2965*c217d954SCole Faust
/* Partial in Y only: the row count is chosen at run time by PARTIAL_COND_Y. */
2966*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2967*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2968*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2969*c217d954SCole Faust
2970*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2971*c217d954SCole Faust
/* Partial in X only: the per-row element count is chosen at run time by PARTIAL_COND_X. */
2972*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2973*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2974*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2975*c217d954SCole Faust
2976*c217d954SCole Faust#else
2977*c217d954SCole Faust
/* Remaining case (reached when both build-time values are non-zero): both the row count
 * and the per-row element count are chosen at run time. */
2978*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2979*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2980*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2981*c217d954SCole Faust
2982*c217d954SCole Faust#endif
2983*c217d954SCole Faust
2984*c217d954SCole Faust
/* LOAD_TEXTURE2D_ROW_1 .. LOAD_TEXTURE2D_ROW_16: read n rows from a 2D image.
 *
 * LOAD_TEXTURE2D_ROW_n expands LOAD_TEXTURE2D_ROW_(n-1) and then appends, for row
 * k = n - 1, the assignment
 *     BASENAME##k = READ_IMAGE2D(DATA_TYPE, N0, IMG,
 *                                X_COORD + k * X_STEP_ROW, Y_COORD + k * Y_STEP_ROW)
 * Row suffixes are 0-9 and then A-F for rows 10..15. Both coordinates advance with the
 * row index, each by its own step.
 *
 * These macros only assign: BASENAME##k must already be declared by the caller.
 * NOTE(review): the bodies here contain no ';' between consecutive assignments, so the
 * statement terminator presumably comes from the READ_IMAGE2D expansion, which is
 * defined outside this section - verify before reusing these macros with a different
 * READ_IMAGE2D.
 */
2985*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2986*c217d954SCole Faust    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
2987*c217d954SCole Faust
2988*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2989*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2990*c217d954SCole Faust    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
2991*c217d954SCole Faust
2992*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2993*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2994*c217d954SCole Faust    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
2995*c217d954SCole Faust
2996*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2997*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2998*c217d954SCole Faust    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
2999*c217d954SCole Faust
3000*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3001*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3002*c217d954SCole Faust    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
3003*c217d954SCole Faust
3004*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3005*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3006*c217d954SCole Faust    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
3007*c217d954SCole Faust
3008*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3009*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3010*c217d954SCole Faust    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
3011*c217d954SCole Faust
3012*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3013*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3014*c217d954SCole Faust    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
3015*c217d954SCole Faust
3016*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3017*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3018*c217d954SCole Faust    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
3019*c217d954SCole Faust
3020*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3021*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
3022*c217d954SCole Faust    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
3023*c217d954SCole Faust
3024*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3025*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3026*c217d954SCole Faust    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
3027*c217d954SCole Faust
3028*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3029*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3030*c217d954SCole Faust    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
3031*c217d954SCole Faust
3032*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3033*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3034*c217d954SCole Faust    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
3035*c217d954SCole Faust
3036*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3037*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3038*c217d954SCole Faust    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
3039*c217d954SCole Faust
3040*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3041*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3042*c217d954SCole Faust    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
3043*c217d954SCole Faust
3044*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3045*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3046*c217d954SCole Faust    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
3047*c217d954SCole Faust
3048*c217d954SCole Faust
3049*c217d954SCole Faust
/* LOAD_TEXTURE2D(M0, ...) selects LOAD_TEXTURE2D_ROW_<M0> by token pasting and forwards
 * the remaining arguments unchanged, reading M0 rows into BASENAME##0 .. BASENAME##(M0-1).
 *
 * Callers use LOAD_TEXTURE2D; LOAD_TEXTURE2D_STR is the pasting helper. The extra level
 * lets an M0 argument that is itself a macro expand before the ## concatenation.
 * The destination variables must be declared by the caller.
 */
3050*c217d954SCole Faust#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3051*c217d954SCole Faust#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3052*c217d954SCole Faust
3053*c217d954SCole Faust
3054*c217d954SCole Faust
/* LOAD_ROW_INDIRECT_1 .. LOAD_ROW_INDIRECT_8: masked loads of rows picked by index.
 *
 * LOAD_ROW_INDIRECT_n expands LOAD_ROW_INDIRECT_(n-1) and then, for row k = n - 1:
 *   - declares BASENAME##k with type VEC_DATA_TYPE(DATA_TYPE, N0);
 *   - if Y_MASK##k != 0, loads it with VLOAD(N0) at index 0 of the address
 *     (PTR + OFFSET + Y##k * STRIDE_Y) cast to a __global DATA_TYPE pointer;
 *   - otherwise assigns 0 to it, so no memory is read for a masked-out row.
 *
 * Here the row position comes from the per-row value Y##k multiplied by STRIDE_Y, not
 * from the constant k. Y and Y_MASK are identifier prefixes: Y##k and Y_MASK##k must
 * exist for every row that is expanded.
 *
 * The expansion mixes declarations with unbraced if / else statements, so it must be
 * placed where declarations are allowed and where BASENAME##0 .. BASENAME##k are not
 * yet declared.
 */
3055*c217d954SCole Faust#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3056*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3057*c217d954SCole Faust    BASENAME##0;                                                                            \
3058*c217d954SCole Faust    if(Y_MASK##0 != 0)                                                                      \
3059*c217d954SCole Faust        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
3060*c217d954SCole Faust    else                                                                                    \
3061*c217d954SCole Faust        BASENAME##0 = 0;
3062*c217d954SCole Faust
3063*c217d954SCole Faust#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3064*c217d954SCole Faust    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3065*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3066*c217d954SCole Faust    BASENAME##1;                                                                            \
3067*c217d954SCole Faust    if(Y_MASK##1 != 0)                                                                      \
3068*c217d954SCole Faust        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
3069*c217d954SCole Faust    else                                                                                    \
3070*c217d954SCole Faust        BASENAME##1 = 0;
3071*c217d954SCole Faust
3072*c217d954SCole Faust#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3073*c217d954SCole Faust    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3074*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3075*c217d954SCole Faust    BASENAME##2;                                                                            \
3076*c217d954SCole Faust    if(Y_MASK##2 != 0)                                                                      \
3077*c217d954SCole Faust        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
3078*c217d954SCole Faust    else                                                                                    \
3079*c217d954SCole Faust        BASENAME##2 = 0;
3080*c217d954SCole Faust
3081*c217d954SCole Faust#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3082*c217d954SCole Faust    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3083*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3084*c217d954SCole Faust    BASENAME##3;                                                                            \
3085*c217d954SCole Faust    if(Y_MASK##3 != 0)                                                                      \
3086*c217d954SCole Faust        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
3087*c217d954SCole Faust    else                                                                                    \
3088*c217d954SCole Faust        BASENAME##3 = 0;
3089*c217d954SCole Faust
3090*c217d954SCole Faust#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3091*c217d954SCole Faust    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3092*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3093*c217d954SCole Faust    BASENAME##4;                                                                            \
3094*c217d954SCole Faust    if(Y_MASK##4 != 0)                                                                      \
3095*c217d954SCole Faust        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
3096*c217d954SCole Faust    else                                                                                    \
3097*c217d954SCole Faust        BASENAME##4 = 0;
3098*c217d954SCole Faust
3099*c217d954SCole Faust#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3100*c217d954SCole Faust    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3101*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3102*c217d954SCole Faust    BASENAME##5;                                                                            \
3103*c217d954SCole Faust    if(Y_MASK##5 != 0)                                                                      \
3104*c217d954SCole Faust        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
3105*c217d954SCole Faust    else                                                                                    \
3106*c217d954SCole Faust        BASENAME##5 = 0;
3107*c217d954SCole Faust
3108*c217d954SCole Faust#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3109*c217d954SCole Faust    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3110*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3111*c217d954SCole Faust    BASENAME##6;                                                                            \
3112*c217d954SCole Faust    if(Y_MASK##6 != 0)                                                                      \
3113*c217d954SCole Faust        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
3114*c217d954SCole Faust    else                                                                                    \
3115*c217d954SCole Faust        BASENAME##6 = 0;
3116*c217d954SCole Faust
3117*c217d954SCole Faust#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3118*c217d954SCole Faust    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3119*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3120*c217d954SCole Faust    BASENAME##7;                                                                            \
3121*c217d954SCole Faust    if(Y_MASK##7 != 0)                                                                      \
3122*c217d954SCole Faust        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
3123*c217d954SCole Faust    else                                                                                    \
3124*c217d954SCole Faust        BASENAME##7 = 0;
3125*c217d954SCole Faust
3126*c217d954SCole Faust#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3127*c217d954SCole Faust    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3128*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3129*c217d954SCole Faust    BASENAME##8;                                                                            \
3130*c217d954SCole Faust    if(Y_MASK##8 != 0)                                                                      \
3131*c217d954SCole Faust        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
3132*c217d954SCole Faust    else                                                                                    \
3133*c217d954SCole Faust        BASENAME##8 = 0;
3134*c217d954SCole Faust
3135*c217d954SCole Faust#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3136*c217d954SCole Faust    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3137*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3138*c217d954SCole Faust    BASENAME##9;                                                                            \
3139*c217d954SCole Faust    if(Y_MASK##9 != 0)                                                                      \
3140*c217d954SCole Faust        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
3141*c217d954SCole Faust    else                                                                                    \
3142*c217d954SCole Faust        BASENAME##9 = 0;
3143*c217d954SCole Faust
3144*c217d954SCole Faust#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3145*c217d954SCole Faust    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3146*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3147*c217d954SCole Faust    BASENAME##A;                                                                            \
3148*c217d954SCole Faust    if(Y_MASK##A != 0)                                                                      \
3149*c217d954SCole Faust        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
3150*c217d954SCole Faust    else                                                                                    \
3151*c217d954SCole Faust        BASENAME##A = 0;
3152*c217d954SCole Faust
3153*c217d954SCole Faust#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3154*c217d954SCole Faust    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3155*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3156*c217d954SCole Faust    BASENAME##B;                                                                            \
3157*c217d954SCole Faust    if(Y_MASK##B != 0)                                                                      \
3158*c217d954SCole Faust        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
3159*c217d954SCole Faust    else                                                                                    \
3160*c217d954SCole Faust        BASENAME##B = 0;
3161*c217d954SCole Faust
3162*c217d954SCole Faust#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3163*c217d954SCole Faust    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3164*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3165*c217d954SCole Faust    BASENAME##C;                                                                            \
3166*c217d954SCole Faust    if(Y_MASK##C != 0)                                                                      \
3167*c217d954SCole Faust        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
3168*c217d954SCole Faust    else                                                                                    \
3169*c217d954SCole Faust        BASENAME##C = 0;
3170*c217d954SCole Faust
3171*c217d954SCole Faust#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3172*c217d954SCole Faust    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3173*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3174*c217d954SCole Faust    BASENAME##D;                                                                            \
3175*c217d954SCole Faust    if(Y_MASK##D != 0)                                                                      \
3176*c217d954SCole Faust        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
3177*c217d954SCole Faust    else                                                                                    \
3178*c217d954SCole Faust        BASENAME##D = 0;
3179*c217d954SCole Faust
3180*c217d954SCole Faust#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3181*c217d954SCole Faust    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3182*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3183*c217d954SCole Faust    BASENAME##E;                                                                            \
3184*c217d954SCole Faust    if(Y_MASK##E != 0)                                                                      \
3185*c217d954SCole Faust        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
3186*c217d954SCole Faust    else                                                                                    \
3187*c217d954SCole Faust        BASENAME##E = 0;
3188*c217d954SCole Faust
3189*c217d954SCole Faust#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3190*c217d954SCole Faust    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3191*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3192*c217d954SCole Faust    BASENAME##F;                                                                            \
3193*c217d954SCole Faust    if(Y_MASK##F != 0)                                                                      \
3194*c217d954SCole Faust        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
3195*c217d954SCole Faust    else                                                                                    \
3196*c217d954SCole Faust        BASENAME##F = 0;
3197*c217d954SCole Faust
3198*c217d954SCole Faust
3199*c217d954SCole Faust#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3200*c217d954SCole Faust#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3201*c217d954SCole Faust
3202*c217d954SCole Faust
/* Shared body of the LOAD_ELEMENT_<M> ladder.
 *
 * Declares ROW as a VEC_DATA_TYPE(DATA_TYPE, N0) variable and initialises it from the
 * single DATA_TYPE element stored at PTR + OFFSET + ROW_IDX * STRIDE_Y.
 */
#define LOAD_ELEMENT_AT(N0, DATA_TYPE, ROW, PTR, OFFSET, STRIDE_Y, ROW_IDX) \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                            \
    ROW = *((__global DATA_TYPE *)(PTR + OFFSET + ROW_IDX * STRIDE_Y));

/* LOAD_ELEMENT_<M>: loads rows 0 .. M-1, one element per row. Each step chains to the
 * previous macro and adds one row; the variable suffix is a hex digit (0-9, A-F).
 */
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##0, PTR, OFFSET, STRIDE_Y, 0)

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##1, PTR, OFFSET, STRIDE_Y, 1)

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##2, PTR, OFFSET, STRIDE_Y, 2)

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##3, PTR, OFFSET, STRIDE_Y, 3)

#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##4, PTR, OFFSET, STRIDE_Y, 4)

#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##5, PTR, OFFSET, STRIDE_Y, 5)

#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##6, PTR, OFFSET, STRIDE_Y, 6)

#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##7, PTR, OFFSET, STRIDE_Y, 7)

#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##8, PTR, OFFSET, STRIDE_Y, 8)

#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##9, PTR, OFFSET, STRIDE_Y, 9)

#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##A, PTR, OFFSET, STRIDE_Y, 10)

#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##B, PTR, OFFSET, STRIDE_Y, 11)

#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##C, PTR, OFFSET, STRIDE_Y, 12)

#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##D, PTR, OFFSET, STRIDE_Y, 13)

#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##E, PTR, OFFSET, STRIDE_Y, 14)

#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    LOAD_ELEMENT_AT(N0, DATA_TYPE, BASENAME##F, PTR, OFFSET, STRIDE_Y, 15)




/* Entry point: selects LOAD_ELEMENT_<M0>. The extra _STR level lets M0 be
 * macro-expanded before it is pasted onto the ladder name.
 */
#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3287*c217d954SCole Faust
3288*c217d954SCole Faust
3289*c217d954SCole Faust
/* CALCULATE_Z_OFFSET_<M>: for each row i in 0 .. M-1 assigns to the already declared
 * variable Z##i:
 *
 *     Z##i = min(DEPTH_GEMM3D - 1, (i + Y) / HEIGHT_GEMM3D) * CROSS_PLANE_PAD * STRIDE_Y
 *
 * with the division and the clamp carried out in DATA_TYPE. M0 is only forwarded along
 * the ladder; the per-row statements do not read it.
 *
 * Every macro parameter is parenthesised where it is used. Without that, a compound
 * argument is mis-evaluated: the cast would bind only to the first operand of
 * HEIGHT_GEMM3D, and operator precedence would split CROSS_PLANE_PAD, STRIDE_Y and
 * DEPTH_GEMM3D. Plain literals and identifiers expand exactly as before.
 */
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##0 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##0);                                                    \
    Z##0 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##1 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##1);                                                    \
    Z##1 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##2 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##2);                                                    \
    Z##2 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##3 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##3);                                                    \
    Z##3 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##4 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##4);                                                    \
    Z##4 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##5 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##5);                                                    \
    Z##5 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##6 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##6);                                                    \
    Z##6 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D);                                             \
    Z##7 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##7);                                                    \
    Z##7 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));




/* Entry point: selects CALCULATE_Z_OFFSET_<M0> (ladder defined for 1 .. 8). The extra
 * _STR level lets M0 be macro-expanded before it is pasted onto the ladder name.
 */
#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3342*c217d954SCole Faust
3343*c217d954SCole Faust
3344*c217d954SCole Faust
/* SCALE_ROW_<N>: multiplies the N variables BASENAME##0 .. BASENAME##<N-1> in place by
 * SCALE converted to DATA_TYPE. Row suffixes are single hex digits (0-9, A-F).
 *
 * SCALE is parenthesised inside the cast so that the whole argument is converted.
 * Without the parentheses a compound argument such as (a + b) would have only its first
 * operand cast. Plain literals and identifiers expand exactly as before.
 */
#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##0 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##1 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##2 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##3 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##4 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##5 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##6 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##7 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##8 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
    BASENAME##9 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##A *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##B *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##C *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##D *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##E *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##F *= (DATA_TYPE)(SCALE);



/* Entry point: selects SCALE_ROW_<N>. The extra _STR level lets N be macro-expanded
 * before it is pasted onto the ladder name.
 */
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
3412*c217d954SCole Faust
3413*c217d954SCole Faust
3414*c217d954SCole Faust
/* Selects component s<IDX_COL> of the row variable ROW. */
#define COLUMN_VECTOR_LANE(ROW, IDX_COL) (ROW).s##IDX_COL

/* COLUMN_VECTOR<K>: declares BASENAME##IDX_COL and initialises it with component
 * s<IDX_COL> taken from each of the K row variables X##0 .. X##<K-1>, in row order.
 * For K == 1 the result is a scalar of type TYPE, otherwise a VEC_DATA_TYPE(TYPE, K).
 */
#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)(COLUMN_VECTOR_LANE(X##0, IDX_COL));
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE)         \
    VEC_DATA_TYPE(TYPE, 2)                                 \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))(          \
        COLUMN_VECTOR_LANE(X##0, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##1, IDX_COL));
#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE)         \
    VEC_DATA_TYPE(TYPE, 3)                                 \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))(          \
        COLUMN_VECTOR_LANE(X##0, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##1, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##2, IDX_COL));
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE)         \
    VEC_DATA_TYPE(TYPE, 4)                                 \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))(          \
        COLUMN_VECTOR_LANE(X##0, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##1, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##2, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##3, IDX_COL));
#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE)         \
    VEC_DATA_TYPE(TYPE, 8)                                 \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))(          \
        COLUMN_VECTOR_LANE(X##0, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##1, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##2, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##3, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##4, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##5, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##6, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##7, IDX_COL));
#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE)        \
    VEC_DATA_TYPE(TYPE, 16)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))(         \
        COLUMN_VECTOR_LANE(X##0, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##1, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##2, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##3, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##4, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##5, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##6, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##7, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##8, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##9, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##A, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##B, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##C, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##D, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##E, IDX_COL),                 \
        COLUMN_VECTOR_LANE(X##F, IDX_COL));
3432*c217d954SCole Faust
3433*c217d954SCole Faust
3434*c217d954SCole Faust
/* COLUMN_VECTOR_SCALAR<K>: declares BASENAME##IDX_COL and initialises it from the K
 * row variables X##0 .. X##<K-1> taken whole (no component selection), in row order.
 * IDX_COL is used only to name the destination. For K == 1 the result is a scalar of
 * type TYPE, otherwise a VEC_DATA_TYPE(TYPE, K).
 */
#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))(         \
        (X##0), (X##1));
#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))(         \
        (X##0), (X##1), (X##2));
#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))(         \
        (X##0), (X##1), (X##2), (X##3));
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))(         \
        (X##0), (X##1), (X##2), (X##3),                   \
        (X##4), (X##5), (X##6), (X##7));
#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))(         \
        (X##0), (X##1), (X##2), (X##3),                    \
        (X##4), (X##5), (X##6), (X##7),                    \
        (X##8), (X##9), (X##A), (X##B),                    \
        (X##C), (X##D), (X##E), (X##F));
3452*c217d954SCole Faust
3453*c217d954SCole Faust
3454*c217d954SCole Faust
/* TRANSPOSE_K0X<N>: emits one column declaration per column index 0 .. N-1 of a block
 * held in the row variables BS##0 .. BS##<K0-1>. Column indices are single hex digits
 * (0-9, A-F). The single-column case goes through COLUMN_VECTOR_SCALAR, which uses the
 * rows whole; all other cases go through COLUMN_VECTOR, which selects component
 * s<index> of every row. Each macro lists its columns explicitly.
 */
#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
3482*c217d954SCole Faust
3483*c217d954SCole Faust
3484*c217d954SCole Faust
3485*c217d954SCole Faust
/* Dispatches to COLUMN_VECTOR<K0>, forwarding the remaining arguments unchanged. The
 * target name is formed by CONCAT, which is defined elsewhere in this file.
 */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR, K0)(IDX_COL, BASENAME, BS, TYPE);
3489*c217d954SCole Faust
3490*c217d954SCole Faust
// COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE)
// Same dispatch shape as COLUMN_VECTOR, but the name handed to CONCAT is
// COLUMN_VECTOR_SCALAR, so a different family of per-K0 macros is selected.
// The selected macro is invoked with (IDX_COL, BASENAME, BS, TYPE) and the
// expansion ends with ';'. CONCAT and the targets are defined elsewhere.
3491*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
3492*c217d954SCole Faust    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
3493*c217d954SCole Faust    (IDX_COL, BASENAME, BS, TYPE);
3494*c217d954SCole Faust
3495*c217d954SCole Faust
// TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE)
// Entry point of the TRANSPOSE_K0X<N> family: combines TRANSPOSE_K0X with
// N0 through CONCAT and invokes the result with (K0, BASENAME, BS, TYPE).
// Only values of N0 that have a matching TRANSPOSE_K0X<N0> definition can
// be used. The expansion ends with ';'. CONCAT is defined elsewhere.
3496*c217d954SCole Faust#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
3497*c217d954SCole Faust    CONCAT(TRANSPOSE_K0X, N0)                       \
3498*c217d954SCole Faust    (K0, BASENAME, BS, TYPE);
3499*c217d954SCole Faust
3500*c217d954SCole Faust
// ADD_ROW_<n>(BASENAME, BIAS), n = 1..16.
// Expands to n statements of the form
//     BASENAME##i += BIAS##i;
// for rows i = 0..n-1. Each macro reuses the previous one and appends the
// statement for its own last row. Row suffixes 10..15 are spelled A..F,
// so the operands must be named e.g. c0..c9, cA..cF.
3501*c217d954SCole Faust#define ADD_ROW_1(BASENAME, BIAS) \
3502*c217d954SCole Faust    BASENAME##0 += BIAS##0;
3503*c217d954SCole Faust
3504*c217d954SCole Faust#define ADD_ROW_2(BASENAME, BIAS) \
3505*c217d954SCole Faust    ADD_ROW_1(BASENAME, BIAS)     \
3506*c217d954SCole Faust    BASENAME##1 += BIAS##1;
3507*c217d954SCole Faust
3508*c217d954SCole Faust#define ADD_ROW_3(BASENAME, BIAS) \
3509*c217d954SCole Faust    ADD_ROW_2(BASENAME, BIAS)     \
3510*c217d954SCole Faust    BASENAME##2 += BIAS##2;
3511*c217d954SCole Faust
3512*c217d954SCole Faust#define ADD_ROW_4(BASENAME, BIAS) \
3513*c217d954SCole Faust    ADD_ROW_3(BASENAME, BIAS)     \
3514*c217d954SCole Faust    BASENAME##3 += BIAS##3;
3515*c217d954SCole Faust
3516*c217d954SCole Faust#define ADD_ROW_5(BASENAME, BIAS) \
3517*c217d954SCole Faust    ADD_ROW_4(BASENAME, BIAS)     \
3518*c217d954SCole Faust    BASENAME##4 += BIAS##4;
3519*c217d954SCole Faust
3520*c217d954SCole Faust#define ADD_ROW_6(BASENAME, BIAS) \
3521*c217d954SCole Faust    ADD_ROW_5(BASENAME, BIAS)     \
3522*c217d954SCole Faust    BASENAME##5 += BIAS##5;
3523*c217d954SCole Faust
3524*c217d954SCole Faust#define ADD_ROW_7(BASENAME, BIAS) \
3525*c217d954SCole Faust    ADD_ROW_6(BASENAME, BIAS)     \
3526*c217d954SCole Faust    BASENAME##6 += BIAS##6;
3527*c217d954SCole Faust
3528*c217d954SCole Faust#define ADD_ROW_8(BASENAME, BIAS) \
3529*c217d954SCole Faust    ADD_ROW_7(BASENAME, BIAS)     \
3530*c217d954SCole Faust    BASENAME##7 += BIAS##7;
3531*c217d954SCole Faust
3532*c217d954SCole Faust#define ADD_ROW_9(BASENAME, BIAS) \
3533*c217d954SCole Faust    ADD_ROW_8(BASENAME, BIAS)     \
3534*c217d954SCole Faust    BASENAME##8 += BIAS##8;
3535*c217d954SCole Faust
3536*c217d954SCole Faust#define ADD_ROW_10(BASENAME, BIAS) \
3537*c217d954SCole Faust    ADD_ROW_9(BASENAME, BIAS)      \
3538*c217d954SCole Faust    BASENAME##9 += BIAS##9;
3539*c217d954SCole Faust
3540*c217d954SCole Faust#define ADD_ROW_11(BASENAME, BIAS) \
3541*c217d954SCole Faust    ADD_ROW_10(BASENAME, BIAS)     \
3542*c217d954SCole Faust    BASENAME##A += BIAS##A;
3543*c217d954SCole Faust
3544*c217d954SCole Faust#define ADD_ROW_12(BASENAME, BIAS) \
3545*c217d954SCole Faust    ADD_ROW_11(BASENAME, BIAS)     \
3546*c217d954SCole Faust    BASENAME##B += BIAS##B;
3547*c217d954SCole Faust
3548*c217d954SCole Faust#define ADD_ROW_13(BASENAME, BIAS) \
3549*c217d954SCole Faust    ADD_ROW_12(BASENAME, BIAS)     \
3550*c217d954SCole Faust    BASENAME##C += BIAS##C;
3551*c217d954SCole Faust
3552*c217d954SCole Faust#define ADD_ROW_14(BASENAME, BIAS) \
3553*c217d954SCole Faust    ADD_ROW_13(BASENAME, BIAS)     \
3554*c217d954SCole Faust    BASENAME##D += BIAS##D;
3555*c217d954SCole Faust
3556*c217d954SCole Faust#define ADD_ROW_15(BASENAME, BIAS) \
3557*c217d954SCole Faust    ADD_ROW_14(BASENAME, BIAS)     \
3558*c217d954SCole Faust    BASENAME##E += BIAS##E;
3559*c217d954SCole Faust
3560*c217d954SCole Faust#define ADD_ROW_16(BASENAME, BIAS) \
3561*c217d954SCole Faust    ADD_ROW_15(BASENAME, BIAS)     \
3562*c217d954SCole Faust    BASENAME##F += BIAS##F;
3563*c217d954SCole Faust
3564*c217d954SCole Faust
3565*c217d954SCole Faust
3566*c217d954SCole Faust
// ADD_BLOCK(N, BASENAME, BIAS): row-wise add for an N-row block; selects
// ADD_ROW_<N> by token pasting.
// Two levels are used on purpose: operands of ## are not macro-expanded,
// so ADD_BLOCK forwards to ADD_BLOCK_STR to get N expanded (for example
// when N is itself a macro such as a build-time constant) before it is
// pasted onto ADD_ROW_. Call ADD_BLOCK, not ADD_BLOCK_STR.
3567*c217d954SCole Faust#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
3568*c217d954SCole Faust#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
3569*c217d954SCole Faust
3570*c217d954SCole Faust
3571*c217d954SCole Faust
// ADD_ROW_BROADCAST_<n>(BASENAME, BIAS), n = 1..16.
// Expands to n statements of the form
//     BASENAME##i += BIAS;
// for rows i = 0..n-1. Unlike ADD_ROW_<n>, BIAS is used as given (no row
// suffix is pasted onto it), so the same BIAS expression is added to every
// row. Row suffixes 10..15 are spelled A..F.
// BIAS is substituted textually once per row; an argument with side
// effects would be evaluated n times.
3572*c217d954SCole Faust#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
3573*c217d954SCole Faust    BASENAME##0 += BIAS;
3574*c217d954SCole Faust
3575*c217d954SCole Faust#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
3576*c217d954SCole Faust    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
3577*c217d954SCole Faust    BASENAME##1 += BIAS;
3578*c217d954SCole Faust
3579*c217d954SCole Faust#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
3580*c217d954SCole Faust    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
3581*c217d954SCole Faust    BASENAME##2 += BIAS;
3582*c217d954SCole Faust
3583*c217d954SCole Faust#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
3584*c217d954SCole Faust    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
3585*c217d954SCole Faust    BASENAME##3 += BIAS;
3586*c217d954SCole Faust
3587*c217d954SCole Faust#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
3588*c217d954SCole Faust    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
3589*c217d954SCole Faust    BASENAME##4 += BIAS;
3590*c217d954SCole Faust
3591*c217d954SCole Faust#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
3592*c217d954SCole Faust    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
3593*c217d954SCole Faust    BASENAME##5 += BIAS;
3594*c217d954SCole Faust
3595*c217d954SCole Faust#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
3596*c217d954SCole Faust    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
3597*c217d954SCole Faust    BASENAME##6 += BIAS;
3598*c217d954SCole Faust
3599*c217d954SCole Faust#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
3600*c217d954SCole Faust    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
3601*c217d954SCole Faust    BASENAME##7 += BIAS;
3602*c217d954SCole Faust
3603*c217d954SCole Faust#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
3604*c217d954SCole Faust    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
3605*c217d954SCole Faust    BASENAME##8 += BIAS;
3606*c217d954SCole Faust
3607*c217d954SCole Faust#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
3608*c217d954SCole Faust    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
3609*c217d954SCole Faust    BASENAME##9 += BIAS;
3610*c217d954SCole Faust
3611*c217d954SCole Faust#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
3612*c217d954SCole Faust    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
3613*c217d954SCole Faust    BASENAME##A += BIAS;
3614*c217d954SCole Faust
3615*c217d954SCole Faust#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
3616*c217d954SCole Faust    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
3617*c217d954SCole Faust    BASENAME##B += BIAS;
3618*c217d954SCole Faust
3619*c217d954SCole Faust#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
3620*c217d954SCole Faust    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
3621*c217d954SCole Faust    BASENAME##C += BIAS;
3622*c217d954SCole Faust
3623*c217d954SCole Faust#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
3624*c217d954SCole Faust    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
3625*c217d954SCole Faust    BASENAME##D += BIAS;
3626*c217d954SCole Faust
3627*c217d954SCole Faust#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
3628*c217d954SCole Faust    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
3629*c217d954SCole Faust    BASENAME##E += BIAS;
3630*c217d954SCole Faust
3631*c217d954SCole Faust#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
3632*c217d954SCole Faust    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
3633*c217d954SCole Faust    BASENAME##F += BIAS;
3634*c217d954SCole Faust
3635*c217d954SCole Faust
// ADD_BLOCK_BROADCAST(N, BASENAME, BIAS): adds the single BIAS expression
// to each of the N rows by selecting ADD_ROW_BROADCAST_<N>.
// ADD_BLOCK_BROADCAST forwards to the _STR helper so that N is
// macro-expanded before ## pastes it onto ADD_ROW_BROADCAST_; operands of
// ## are not expanded on their own. Call ADD_BLOCK_BROADCAST, not the
// helper.
3636*c217d954SCole Faust#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
3637*c217d954SCole Faust#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
3638*c217d954SCole Faust
3639*c217d954SCole Faust
3640*c217d954SCole Faust
// ACTIVATION_ROW_<n>(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL),
// n = 1..16.
// Expands to n statements of the form
//     BASENAME##i = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##i, A_VAL, B_VAL);
// for rows i = 0..n-1, i.e. every row variable is overwritten with the
// result of ACTIVATION applied to itself. Each macro reuses the previous
// one; row suffixes 10..15 are spelled A..F.
// ACTIVATION is defined outside this section; the meaning of A_VAL and
// B_VAL is whatever that macro gives them.
3641*c217d954SCole Faust#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3642*c217d954SCole Faust    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);
3643*c217d954SCole Faust
3644*c217d954SCole Faust#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3645*c217d954SCole Faust    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3646*c217d954SCole Faust    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);
3647*c217d954SCole Faust
3648*c217d954SCole Faust#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3649*c217d954SCole Faust    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3650*c217d954SCole Faust    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);
3651*c217d954SCole Faust
3652*c217d954SCole Faust#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3653*c217d954SCole Faust    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3654*c217d954SCole Faust    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);
3655*c217d954SCole Faust
3656*c217d954SCole Faust#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3657*c217d954SCole Faust    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3658*c217d954SCole Faust    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);
3659*c217d954SCole Faust
3660*c217d954SCole Faust#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3661*c217d954SCole Faust    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3662*c217d954SCole Faust    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);
3663*c217d954SCole Faust
3664*c217d954SCole Faust#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3665*c217d954SCole Faust    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3666*c217d954SCole Faust    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);
3667*c217d954SCole Faust
3668*c217d954SCole Faust#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3669*c217d954SCole Faust    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3670*c217d954SCole Faust    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);
3671*c217d954SCole Faust
3672*c217d954SCole Faust#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3673*c217d954SCole Faust    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3674*c217d954SCole Faust    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);
3675*c217d954SCole Faust
3676*c217d954SCole Faust#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3677*c217d954SCole Faust    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
3678*c217d954SCole Faust    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);
3679*c217d954SCole Faust
3680*c217d954SCole Faust#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3681*c217d954SCole Faust    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3682*c217d954SCole Faust    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);
3683*c217d954SCole Faust
3684*c217d954SCole Faust#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3685*c217d954SCole Faust    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3686*c217d954SCole Faust    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);
3687*c217d954SCole Faust
3688*c217d954SCole Faust#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3689*c217d954SCole Faust    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3690*c217d954SCole Faust    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);
3691*c217d954SCole Faust
3692*c217d954SCole Faust#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3693*c217d954SCole Faust    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3694*c217d954SCole Faust    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);
3695*c217d954SCole Faust
3696*c217d954SCole Faust#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3697*c217d954SCole Faust    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3698*c217d954SCole Faust    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);
3699*c217d954SCole Faust
3700*c217d954SCole Faust#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3701*c217d954SCole Faust    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3702*c217d954SCole Faust    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
3703*c217d954SCole Faust
3704*c217d954SCole Faust
3705*c217d954SCole Faust
// ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL):
// applies the activation to each of the N rows by selecting
// ACTIVATION_ROW_<N>; all remaining arguments are forwarded unchanged.
// The _STR helper exists so that N is macro-expanded before ## pastes it
// onto ACTIVATION_ROW_. Call ACTIVATION_BLOCK, not the helper.
3706*c217d954SCole Faust#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3707*c217d954SCole Faust#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3708*c217d954SCole Faust
3709*c217d954SCole Faust
3710*c217d954SCole Faust
// CONVERT_ROW_<m>(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST), m = 1..16.
// Expands to m statements of the form
//     VEC_DATA_TYPE(DATA_TYPE, N) BASENAME_DST##i = CONVERT(BASENAME_SRC##i, VEC_DATA_TYPE(DATA_TYPE, N));
// for rows i = 0..m-1. Each macro reuses the previous one; row suffixes
// 10..15 are spelled A..F.
// VEC_DATA_TYPE and CONVERT are defined outside this section. Presumably
// VEC_DATA_TYPE yields a vector type name, in which case each statement
// declares a new variable BASENAME_DST##i, so the destination names must
// not already exist in the enclosing scope - confirm against helpers.
3711*c217d954SCole Faust#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3712*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3713*c217d954SCole Faust    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
3714*c217d954SCole Faust
3715*c217d954SCole Faust#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3716*c217d954SCole Faust    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3717*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3718*c217d954SCole Faust    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
3719*c217d954SCole Faust
3720*c217d954SCole Faust#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3721*c217d954SCole Faust    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3722*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3723*c217d954SCole Faust    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
3724*c217d954SCole Faust
3725*c217d954SCole Faust#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3726*c217d954SCole Faust    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3727*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3728*c217d954SCole Faust    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
3729*c217d954SCole Faust
3730*c217d954SCole Faust#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3731*c217d954SCole Faust    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3732*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3733*c217d954SCole Faust    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
3734*c217d954SCole Faust
3735*c217d954SCole Faust#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3736*c217d954SCole Faust    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3737*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3738*c217d954SCole Faust    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
3739*c217d954SCole Faust
3740*c217d954SCole Faust#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3741*c217d954SCole Faust    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3742*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3743*c217d954SCole Faust    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
3744*c217d954SCole Faust
3745*c217d954SCole Faust#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3746*c217d954SCole Faust    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3747*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3748*c217d954SCole Faust    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
3749*c217d954SCole Faust
3750*c217d954SCole Faust#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3751*c217d954SCole Faust    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3752*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3753*c217d954SCole Faust    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
3754*c217d954SCole Faust
3755*c217d954SCole Faust#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3756*c217d954SCole Faust    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
3757*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3758*c217d954SCole Faust    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
3759*c217d954SCole Faust
3760*c217d954SCole Faust#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3761*c217d954SCole Faust    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3762*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3763*c217d954SCole Faust    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
3764*c217d954SCole Faust
3765*c217d954SCole Faust#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3766*c217d954SCole Faust    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3767*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3768*c217d954SCole Faust    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
3769*c217d954SCole Faust
3770*c217d954SCole Faust#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3771*c217d954SCole Faust    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3772*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3773*c217d954SCole Faust    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
3774*c217d954SCole Faust
3775*c217d954SCole Faust#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3776*c217d954SCole Faust    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3777*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3778*c217d954SCole Faust    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
3779*c217d954SCole Faust
3780*c217d954SCole Faust#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3781*c217d954SCole Faust    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3782*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3783*c217d954SCole Faust    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
3784*c217d954SCole Faust
3785*c217d954SCole Faust#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3786*c217d954SCole Faust    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3787*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3788*c217d954SCole Faust    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
3789*c217d954SCole Faust
3790*c217d954SCole Faust
3791*c217d954SCole Faust
// CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST): converts an
// M-row block by selecting CONVERT_ROW_<M>; N, DATA_TYPE and both base
// names are forwarded unchanged. Note that M (the row count) is the value
// pasted into the macro name, while N is only passed through.
// The _STR helper exists so that M is macro-expanded before ## pastes it
// onto CONVERT_ROW_. Call CONVERT_BLOCK, not the helper.
3792*c217d954SCole Faust#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3793*c217d954SCole Faust#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3794*c217d954SCole Faust
3795*c217d954SCole Faust
3796*c217d954SCole Faust#ifndef ARM_COMPUTE_REPEAT_H
3797*c217d954SCole Faust#define ARM_COMPUTE_REPEAT_H
3798*c217d954SCole Faust
3799*c217d954SCole Faust
3800*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
3801*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
3802*c217d954SCole Faust
3803*c217d954SCole Faust
3804*c217d954SCole Faust
3805*c217d954SCole Faust
// STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16.
// Expands to n store statements of the form
//     VSTORE(N0)(BASENAME##i, 0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
// for rows i = 0..n-1: row i is written at PTR advanced by i * STRIDE_Y
// plus the per-row offset Z##i, with the address arithmetic done on PTR
// before the cast to DATA_TYPE *. Each macro reuses the previous one.
// Name suffixes for rows 10..15 are spelled A..F, while the stride
// multiplier stays decimal (10..15).
// VSTORE is defined outside this section.
// NOTE(review): this copy sits inside an #ifndef ARM_COMPUTE_HELPER_H
// region, and the same guard is defined near the top of the file with an
// identical STORE_ROW_1, so this copy is presumably skipped by the
// preprocessor - confirm.
3806*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3807*c217d954SCole Faust    VSTORE(N0)                                                 \
3808*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3809*c217d954SCole Faust
3810*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3811*c217d954SCole Faust    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3812*c217d954SCole Faust    VSTORE(N0)                                                 \
3813*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3814*c217d954SCole Faust
3815*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3816*c217d954SCole Faust    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3817*c217d954SCole Faust    VSTORE(N0)                                                 \
3818*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3819*c217d954SCole Faust
3820*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3821*c217d954SCole Faust    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3822*c217d954SCole Faust    VSTORE(N0)                                                 \
3823*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3824*c217d954SCole Faust
3825*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3826*c217d954SCole Faust    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3827*c217d954SCole Faust    VSTORE(N0)                                                 \
3828*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
3829*c217d954SCole Faust
3830*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3831*c217d954SCole Faust    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3832*c217d954SCole Faust    VSTORE(N0)                                                 \
3833*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
3834*c217d954SCole Faust
3835*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3836*c217d954SCole Faust    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3837*c217d954SCole Faust    VSTORE(N0)                                                 \
3838*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
3839*c217d954SCole Faust
3840*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3841*c217d954SCole Faust    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3842*c217d954SCole Faust    VSTORE(N0)                                                 \
3843*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
3844*c217d954SCole Faust
3845*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3846*c217d954SCole Faust    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3847*c217d954SCole Faust    VSTORE(N0)                                                 \
3848*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3849*c217d954SCole Faust
3850*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3851*c217d954SCole Faust    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
3852*c217d954SCole Faust    VSTORE(N0)                                                  \
3853*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3854*c217d954SCole Faust
3855*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3856*c217d954SCole Faust    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3857*c217d954SCole Faust    VSTORE(N0)                                                  \
3858*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
3859*c217d954SCole Faust
3860*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3861*c217d954SCole Faust    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3862*c217d954SCole Faust    VSTORE(N0)                                                  \
3863*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
3864*c217d954SCole Faust
3865*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3866*c217d954SCole Faust    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3867*c217d954SCole Faust    VSTORE(N0)                                                  \
3868*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
3869*c217d954SCole Faust
3870*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3871*c217d954SCole Faust    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3872*c217d954SCole Faust    VSTORE(N0)                                                  \
3873*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
3874*c217d954SCole Faust
3875*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3876*c217d954SCole Faust    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3877*c217d954SCole Faust    VSTORE(N0)                                                  \
3878*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
3879*c217d954SCole Faust
3880*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3881*c217d954SCole Faust    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3882*c217d954SCole Faust    VSTORE(N0)                                                  \
3883*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3884*c217d954SCole Faust
3885*c217d954SCole Faust
3886*c217d954SCole Faust
// CONVERT_STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..9.
// Same addressing as STORE_ROW_<n>, but each row is first passed through
// CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0) and the converted value is
// what gets stored:
//     VSTORE(N0)(CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
//                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
// for rows i = 0..n-1. Each macro reuses the previous one.
// VSTORE, CONVERT_SAT and VEC_DATA_TYPE are defined outside this section;
// presumably CONVERT_SAT is a saturating conversion - confirm.
3887*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3888*c217d954SCole Faust    VSTORE(N0)                                                         \
3889*c217d954SCole Faust    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3890*c217d954SCole Faust
3891*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3892*c217d954SCole Faust    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3893*c217d954SCole Faust    VSTORE(N0)                                                         \
3894*c217d954SCole Faust    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3895*c217d954SCole Faust
3896*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3897*c217d954SCole Faust    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3898*c217d954SCole Faust    VSTORE(N0)                                                         \
3899*c217d954SCole Faust    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3900*c217d954SCole Faust
3901*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3902*c217d954SCole Faust    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3903*c217d954SCole Faust    VSTORE(N0)                                                         \
3904*c217d954SCole Faust    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3905*c217d954SCole Faust
3906*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3907*c217d954SCole Faust    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3908*c217d954SCole Faust    VSTORE(N0)                                                         \
3909*c217d954SCole Faust    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
3910*c217d954SCole Faust
3911*c217d954SCole Faust#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3912*c217d954SCole Faust    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3913*c217d954SCole Faust    VSTORE(N0)                                                         \
3914*c217d954SCole Faust    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
3915*c217d954SCole Faust
3916*c217d954SCole Faust#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3917*c217d954SCole Faust    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3918*c217d954SCole Faust    VSTORE(N0)                                                         \
3919*c217d954SCole Faust    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
3920*c217d954SCole Faust
3921*c217d954SCole Faust#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3922*c217d954SCole Faust    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3923*c217d954SCole Faust    VSTORE(N0)                                                         \
3924*c217d954SCole Faust    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
3925*c217d954SCole Faust
3926*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3927*c217d954SCole Faust    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3928*c217d954SCole Faust    VSTORE(N0)                                                         \
3929*c217d954SCole Faust    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3930*c217d954SCole Faust
// CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
// Converts and stores rows 0..9: reuses CONVERT_STORE_ROW_9 for rows 0..8,
// then stores CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)) at
// PTR + 9 * STRIDE_Y + Z##9.
// The second parameter must be named DATA_TYPE, as in every sibling macro:
// the body refers to DATA_TYPE three times, and with any other parameter
// name those tokens would not be bound to the caller's argument but would
// resolve to whatever DATA_TYPE means at the expansion site.
3931*c217d954SCole Faust#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3932*c217d954SCole Faust    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
3933*c217d954SCole Faust    VSTORE(N0)                                                          \
3934*c217d954SCole Faust    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3935*c217d954SCole Faust
/** CONVERT_STORE_ROW_11 .. CONVERT_STORE_ROW_16: convert and store the first 11 to 16 rows of a block.
 *
 * CONVERT_STORE_ROW_n first expands CONVERT_STORE_ROW_(n-1) and then handles row (n-1). Rows 10..15
 * use the hexadecimal suffixes A..F for the pasted variable names (BASENAME##A, Z##A, ...), while the
 * row multiplier in the address stays decimal (10..15).
 * Each row is passed through CONVERT_SAT with target type VEC_DATA_TYPE(DATA_TYPE, N0) and written
 * with VSTORE(N0) at offset 0 of (__global DATA_TYPE *)(PTR + row * STRIDE_Y + Z##suffix).
 *
 * N0        - vector width forwarded to VSTORE and VEC_DATA_TYPE
 * DATA_TYPE - element type of the destination
 * BASENAME  - prefix of the row variables
 * PTR       - base address of the destination
 * STRIDE_Y  - multiplier applied to the row index
 * Z         - prefix of the per-row offset variables
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3965*c217d954SCole Faust
3966*c217d954SCole Faust
3967*c217d954SCole Faust
3968*c217d954SCole Faust
/** Store a block of M0 rows by dispatching to STORE_ROW_<M0>.
 *
 * STORE_BLOCK is the entry point; it forwards to STORE_BLOCK_STR so that M0 is macro-expanded
 * before it is pasted onto STORE_ROW_. All remaining arguments are forwarded unchanged.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3971*c217d954SCole Faust
3972*c217d954SCole Faust
3973*c217d954SCole Faust
/** Convert and store a block of M0 rows by dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * CONVERT_STORE_BLOCK is the entry point; it forwards to CONVERT_STORE_BLOCK_STR so that M0 is
 * macro-expanded before it is pasted onto CONVERT_STORE_ROW_. All remaining arguments are
 * forwarded unchanged.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3976*c217d954SCole Faust
3977*c217d954SCole Faust
3978*c217d954SCole Faust
/** STORE_ROW_PARTIAL_1 .. STORE_ROW_PARTIAL_8: partial-width store of the first 1 to 8 rows.
 *
 * STORE_ROW_PARTIAL_n first expands STORE_ROW_PARTIAL_(n-1) (for n > 1) and then writes row (n-1):
 * BASENAME##(n-1) is handed to VSTORE_PARTIAL(N0, STORE_N0) together with offset 0 and the pointer
 * (__global DATA_TYPE *)(PTR + (n-1) * STRIDE_Y + Z##(n-1)).
 *
 * N0        - width of the row vectors
 * STORE_N0  - second selector forwarded to VSTORE_PARTIAL (presumably the number of elements
 *             actually written; VSTORE_PARTIAL is defined elsewhere in the file)
 * DATA_TYPE - element type of the destination
 * BASENAME  - prefix of the row variables
 * PTR       - base address of the destination
 * STRIDE_Y  - multiplier applied to the row index
 * Z         - prefix of the per-row offset variables
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
4017*c217d954SCole Faust
/** STORE_ROW_PARTIAL_9 .. STORE_ROW_PARTIAL_16: partial-width store of the first 9 to 16 rows.
 *
 * STORE_ROW_PARTIAL_n first expands STORE_ROW_PARTIAL_(n-1) and then writes row (n-1) through
 * VSTORE_PARTIAL(N0, STORE_N0) at offset 0 of
 * (__global DATA_TYPE *)(PTR + row * STRIDE_Y + Z##suffix).
 * Rows 10..15 use the hexadecimal suffixes A..F for the pasted names (BASENAME##A, Z##A, ...),
 * while the row multiplier in the address stays decimal.
 *
 * N0        - width of the row vectors
 * STORE_N0  - second selector forwarded to VSTORE_PARTIAL
 * DATA_TYPE - element type of the destination
 * BASENAME  - prefix of the row variables
 * PTR       - base address of the destination
 * STRIDE_Y  - multiplier applied to the row index
 * Z         - prefix of the per-row offset variables
 */
#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
4057*c217d954SCole Faust
4058*c217d954SCole Faust
4059*c217d954SCole Faust
/** Partially store a block by dispatching to STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * STORE_BLOCK_PARTIAL is the entry point; it forwards to STORE_BLOCK_PARTIAL_STR so that STORE_M0 is
 * macro-expanded before it is pasted onto STORE_ROW_PARTIAL_.
 * Note the argument order change: the block macros take (STORE_M0, STORE_N0, N0, ...), while the
 * row macros receive (N0, STORE_N0, ...).
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4062*c217d954SCole Faust
/** Store a block that may be partial along both x (columns) and y (rows).
 *
 * Selects one of four STORE_BLOCK_PARTIAL expansions at run time:
 *   neither condition set  -> M0 rows,               N0 columns
 *   only PARTIAL_COND_Y    -> PARTIAL_STORE_M0 rows, N0 columns
 *   only PARTIAL_COND_X    -> M0 rows,               PARTIAL_STORE_N0 columns
 *   both conditions set    -> PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 *
 * The expansion is a bare if / else-if chain (no enclosing braces or do-while), so it must be used
 * as a statement on its own.
 *
 * M0, N0                             - full block height and width
 * DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z - forwarded unchanged to STORE_BLOCK_PARTIAL
 * PARTIAL_STORE_M0, PARTIAL_STORE_N0 - reduced height and width used when the matching condition holds
 * PARTIAL_COND_Y, PARTIAL_COND_X     - run-time conditions selecting the reduced height / width
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!((PARTIAL_COND_X) || (PARTIAL_COND_Y))) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
4080*c217d954SCole Faust
/** Store a block that may be partial along x (columns) only.
 *
 * When PARTIAL_COND_X holds, M0 rows of PARTIAL_STORE_N0 columns are stored; otherwise M0 rows of
 * N0 columns. Both cases go through STORE_BLOCK_PARTIAL.
 * The expansion is a bare if / else, so it must be used as a statement on its own.
 *
 * M0, N0                             - full block height and width
 * DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z - forwarded unchanged to STORE_BLOCK_PARTIAL
 * PARTIAL_STORE_N0                   - reduced width used when PARTIAL_COND_X holds
 * PARTIAL_COND_X                     - run-time condition selecting the reduced width
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if((PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
4090*c217d954SCole Faust
/** Store a block that may be partial along y (rows) only.
 *
 * When PARTIAL_COND_Y holds, PARTIAL_STORE_M0 rows of N0 columns are stored; otherwise M0 rows of
 * N0 columns. Both cases go through STORE_BLOCK_PARTIAL.
 * The expansion is a bare if / else, so it must be used as a statement on its own.
 *
 * M0, N0                             - full block height and width
 * DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z - forwarded unchanged to STORE_BLOCK_PARTIAL
 * PARTIAL_STORE_M0                   - reduced height used when PARTIAL_COND_Y holds
 * PARTIAL_COND_Y                     - run-time condition selecting the reduced height
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if((PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
4100*c217d954SCole Faust
4101*c217d954SCole Faust
/** STORE_BLOCK_BOUNDARY_AWARE: pick the cheapest store variant at preprocessing time.
 *
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined for the build. Their
 * values decide which implementation the macro expands to; every variant takes the same eleven
 * parameters so call sites do not depend on the selection. Inside each definition the parameters
 * named PARTIAL_STORE_M0 / PARTIAL_STORE_N0 shadow the build-time definitions of the same name.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

/* No leftover in either dimension: unconditional full-block store. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

/* Leftover rows only: the run-time check is limited to PARTIAL_COND_Y. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

/* Leftover columns only: the run-time check is limited to PARTIAL_COND_X. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else /* every remaining combination of PARTIAL_STORE_M0 / PARTIAL_STORE_N0 */

/* Both dimensions may be partial: check both conditions at run time. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif /* PARTIAL_STORE_M0 / PARTIAL_STORE_N0 value selection */

#endif /* defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) */
4128*c217d954SCole Faust
4129*c217d954SCole Faust
/** Compute the first row handled by the work-item with index y along the row dimension.
 *
 * When PARTIAL_STORE_M0 is defined for the build, the nominal start row y * M0 is moved back by
 * (M0 - PARTIAL_STORE_M0) % M0 rows and clamped at 0 with max(); for PARTIAL_STORE_M0 == 0 the shift
 * is 0 because of the modulo. Without PARTIAL_STORE_M0 the result is simply y * M0.
 * The result is cast to uint in both variants.
 *
 * Every macro argument is parenthesised in the expansion so that expression arguments
 * (e.g. "a + b" for y) keep their intended grouping.
 *
 * y                - index of the work-item along the row dimension
 * M0               - number of rows processed per work-item
 * PARTIAL_STORE_M0 - number of rows in the partial block (ignored by the second variant)
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
4138*c217d954SCole Faust
4139*c217d954SCole Faust
4140*c217d954SCole Faust
/** Store a single vector, either in full or only its leading leftover elements.
 *
 * Thin wrapper around STORE_BLOCK_PARTIAL_IN_X with a block height of 1, STRIDE_Y = 0 and Z = 0.
 * Because Z is the literal 0, the row macro's Z##0 pastes to the literal 00, i.e. a zero offset.
 *
 * var_prefix - prefix of the vector variable; the row macros append 0 to it
 * elem_type  - element type of the destination
 * dst        - destination address
 * width      - full vector width
 * tail       - width used when is_tail holds
 * is_tail    - run-time condition selecting the reduced width
 */
#define STORE_VECTOR_SELECT(var_prefix, elem_type, dst, width, tail, is_tail) \
    STORE_BLOCK_PARTIAL_IN_X(1, width, elem_type, var_prefix, dst, 0, 0, tail, is_tail)
4143*c217d954SCole Faust
4144*c217d954SCole Faust
/* Optional OpenCL extensions.
 * Each extension is enabled only when both the library's build flag (ARM_COMPUTE_*) and the
 * extension's own feature macro are defined. */

#ifdef ARM_COMPUTE_OPENCL_FP16_ENABLED
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif /* cl_khr_fp16 */
#endif /* ARM_COMPUTE_OPENCL_FP16_ENABLED */

#ifdef ARM_COMPUTE_OPENCL_DOT8_ENABLED
#ifdef cl_arm_integer_dot_product_int8
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif /* cl_arm_integer_dot_product_int8 */
#endif /* ARM_COMPUTE_OPENCL_DOT8_ENABLED */

#ifdef ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED
#ifdef cl_arm_integer_dot_product_accumulate_int8
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif /* cl_arm_integer_dot_product_accumulate_int8 */
#endif /* ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED */

#ifdef ARM_COMPUTE_DEBUG_ENABLED
#ifdef cl_arm_printf
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif /* cl_arm_printf */
#endif /* ARM_COMPUTE_DEBUG_ENABLED */
4160*c217d954SCole Faust
/* Numeric identifiers for GPU architecture families. Nothing in this part of the file reads them;
 * presumably they are compared against a build-time define elsewhere. */
#define GPU_ARCH_MIDGARD 0x100 /* Midgard */
#define GPU_ARCH_BIFROST 0x200 /* Bifrost */
#define GPU_ARCH_VALHALL 0x300 /* Valhall */
4164*c217d954SCole Faust
4165*c217d954SCole Faust
/* Paste two tokens together. The arguments are pasted as written (no prior macro expansion),
 * because they are direct operands of ##. */
#define CONCAT(lhs, rhs) lhs##rhs
4167*c217d954SCole Faust
4168*c217d954SCole Faust
/* Identity macro: yields its argument after one extra round of macro expansion. */
#define EXPAND(token) token
4170*c217d954SCole Faust
4171*c217d954SCole Faust
/* Clamp value to [lower, upper]: max() is applied first with lower, then min() with upper,
 * so upper wins if the bounds are inverted. */
#define CLAMP(value, lower, upper) min(max(value, lower), upper)
4173*c217d954SCole Faust
4174*c217d954SCole Faust
/* REV<s>(v): reverse the order of the s components of v using a component swizzle.
 * REV1 returns its argument unchanged. */
#define REV1(v)  ((v))
#define REV2(v)  ((v).s10)
#define REV3(v)  ((v).s210)
#define REV4(v)  ((v).s3210)
#define REV8(v)  ((v).s76543210)
#define REV16(v) ((v).sFEDCBA9876543210)
4181*c217d954SCole Faust
4182*c217d954SCole Faust
4183*c217d954SCole Faust
/* REVERSE(x, s): reverse a vector x of s components by dispatching to REV<s>.
 * REVERSE forwards to REVERSE_STR so that s is macro-expanded before it is pasted onto REV. */
#define REVERSE_STR(vec, width) REV##width((vec))
#define REVERSE(vec, width) REVERSE_STR(vec, width)
4186*c217d954SCole Faust
4187*c217d954SCole Faust
4188*c217d954SCole Faust
/* ROT<s>_<n>(v): rotate the s components of v by n positions towards higher indices, i.e.
 * component i of the result is component (i - n) mod s of v. The entries for n == 0 and n == s
 * return v unchanged. Implemented with component swizzles, one table per supported width. */

/* width 1 */
#define ROT1_0(v) ((v))
#define ROT1_1(v) ((v))

/* width 2 */
#define ROT2_0(v) ((v))
#define ROT2_1(v) ((v).s10)
#define ROT2_2(v) ((v))

/* width 3 */
#define ROT3_0(v) ((v))
#define ROT3_1(v) ((v).s201)
#define ROT3_2(v) ((v).s120)
#define ROT3_3(v) ((v))

/* width 4 */
#define ROT4_0(v) ((v))
#define ROT4_1(v) ((v).s3012)
#define ROT4_2(v) ((v).s2301)
#define ROT4_3(v) ((v).s1230)
#define ROT4_4(v) ((v))

/* width 8 */
#define ROT8_0(v) ((v))
#define ROT8_1(v) ((v).s70123456)
#define ROT8_2(v) ((v).s67012345)
#define ROT8_3(v) ((v).s56701234)
#define ROT8_4(v) ((v).s45670123)
#define ROT8_5(v) ((v).s34567012)
#define ROT8_6(v) ((v).s23456701)
#define ROT8_7(v) ((v).s12345670)
#define ROT8_8(v) ((v))

/* width 16 */
#define ROT16_0(v)  ((v))
#define ROT16_1(v)  ((v).sF0123456789ABCDE)
#define ROT16_2(v)  ((v).sEF0123456789ABCD)
#define ROT16_3(v)  ((v).sDEF0123456789ABC)
#define ROT16_4(v)  ((v).sCDEF0123456789AB)
#define ROT16_5(v)  ((v).sBCDEF0123456789A)
#define ROT16_6(v)  ((v).sABCDEF0123456789)
#define ROT16_7(v)  ((v).s9ABCDEF012345678)
#define ROT16_8(v)  ((v).s89ABCDEF01234567)
#define ROT16_9(v)  ((v).s789ABCDEF0123456)
#define ROT16_10(v) ((v).s6789ABCDEF012345)
#define ROT16_11(v) ((v).s56789ABCDEF01234)
#define ROT16_12(v) ((v).s456789ABCDEF0123)
#define ROT16_13(v) ((v).s3456789ABCDEF012)
#define ROT16_14(v) ((v).s23456789ABCDEF01)
#define ROT16_15(v) ((v).s123456789ABCDEF0)
#define ROT16_16(v) ((v))
4234*c217d954SCole Faust
4235*c217d954SCole Faust
4236*c217d954SCole Faust
/* ROTATE(x, s, n): rotate a vector x of s components by n positions by dispatching to ROT<s>_<n>.
 * ROTATE forwards to ROTATE_STR so that s and n are macro-expanded before they are pasted. */
#define ROTATE_STR(vec, width, count) ROT##width##_##count(vec)
#define ROTATE(vec, width, count) ROTATE_STR(vec, width, count)
4239*c217d954SCole Faust
4240*c217d954SCole Faust
4241*c217d954SCole Faust
/* V_OFFS<n>(elem): a vector literal of type elem##n holding the ascending indices 0 .. n-1.
 * The type name is formed by pasting n onto elem (e.g. elem##4), so V_OFFS1 requires a type named
 * elem##1 to exist. */
#define V_OFFS1(elem)  (elem##1)(0)
#define V_OFFS2(elem)  (elem##2)(0, 1)
#define V_OFFS3(elem)  (elem##3)(0, 1, 2)
#define V_OFFS4(elem)  (elem##4)(0, 1, 2, 3)
#define V_OFFS8(elem)  (elem##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(elem) (elem##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
4248*c217d954SCole Faust
4249*c217d954SCole Faust
4250*c217d954SCole Faust
/* VEC_OFFS(dt, s): index vector 0 .. s-1 of element type dt, obtained by dispatching to V_OFFS<s>.
 * VEC_OFFS forwards to VEC_OFFS_STR so that s is macro-expanded before it is pasted onto V_OFFS. */
#define VEC_OFFS_STR(elem, width) V_OFFS##width(elem)
#define VEC_OFFS(elem, width) VEC_OFFS_STR(elem, width)
4253*c217d954SCole Faust
4254*c217d954SCole Faust
/* VLOAD(size): name of the vector load of the given width, i.e. vload<size>.
 * VLOAD forwards to VLOAD_STR so that size is macro-expanded before it is pasted onto vload. */
#define VLOAD_STR(width) vload##width
#define VLOAD(width) VLOAD_STR(width)
4257*c217d954SCole Faust
4258*c217d954SCole Faust
/* VLOAD_PARTIAL(size, load_size): name of the partial-load table entry
 * vload_partial_<size>_<load_size>.
 * VLOAD_PARTIAL forwards to VLOAD_PARTIAL_STR so that both arguments are macro-expanded before
 * they are pasted. */
#define VLOAD_PARTIAL_STR(width, count) vload_partial_##width##_##count
#define VLOAD_PARTIAL(width, count) VLOAD_PARTIAL_STR(width, count)
4261*c217d954SCole Faust
/* Placeholder for table entries that must not load anything: takes the same three arguments as
 * the real partial loads and expands to an empty block. */
#define NO_LOAD(data, offs, ptr) { }
4265*c217d954SCole Faust
4266*c217d954SCole Faust
/* Partial-load dispatch table for width 1: vload_partial_1_<n> is the load used when n elements
 * are requested. Only n == 1 loads; every other entry is NO_LOAD.
 * NOTE(review): vload_partial_1_1 aliases vload1, which vload_partial_1 calls with two arguments
 * (OFFSET, PTR), whereas NO_LOAD and the entries of the wider tables take (DATA, OFFSET, PTR) -
 * confirm how width-1 partial loads are invoked. */
#define vload_partial_1_0  NO_LOAD
#define vload_partial_1_1  vload1
#define vload_partial_1_2  NO_LOAD
#define vload_partial_1_3  NO_LOAD
#define vload_partial_1_4  NO_LOAD
#define vload_partial_1_5  NO_LOAD
#define vload_partial_1_6  NO_LOAD
#define vload_partial_1_7  NO_LOAD
#define vload_partial_1_8  NO_LOAD
#define vload_partial_1_9  NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD
4284*c217d954SCole Faust
/* Partial-load dispatch table for width 2: vload_partial_2_<n> is the load used when n elements
 * are requested. Entries 1 and 2 map to vload_partial_<n>; 0 and everything above the width are
 * NO_LOAD. */
#define vload_partial_2_0  NO_LOAD
#define vload_partial_2_1  vload_partial_1
#define vload_partial_2_2  vload_partial_2
#define vload_partial_2_3  NO_LOAD
#define vload_partial_2_4  NO_LOAD
#define vload_partial_2_5  NO_LOAD
#define vload_partial_2_6  NO_LOAD
#define vload_partial_2_7  NO_LOAD
#define vload_partial_2_8  NO_LOAD
#define vload_partial_2_9  NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD
4302*c217d954SCole Faust
/* Partial-load dispatch table for width 3: vload_partial_3_<n> is the load used when n elements
 * are requested. Entries 1..3 map to vload_partial_<n>; 0 and everything above the width are
 * NO_LOAD. */
#define vload_partial_3_0  NO_LOAD
#define vload_partial_3_1  vload_partial_1
#define vload_partial_3_2  vload_partial_2
#define vload_partial_3_3  vload_partial_3
#define vload_partial_3_4  NO_LOAD
#define vload_partial_3_5  NO_LOAD
#define vload_partial_3_6  NO_LOAD
#define vload_partial_3_7  NO_LOAD
#define vload_partial_3_8  NO_LOAD
#define vload_partial_3_9  NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD
4320*c217d954SCole Faust
/* Partial-load dispatch table for width 4: vload_partial_4_<n> is the load used when n elements
 * are requested. Entries 1..4 map to vload_partial_<n>; 0 and everything above the width are
 * NO_LOAD. */
#define vload_partial_4_0  NO_LOAD
#define vload_partial_4_1  vload_partial_1
#define vload_partial_4_2  vload_partial_2
#define vload_partial_4_3  vload_partial_3
#define vload_partial_4_4  vload_partial_4
#define vload_partial_4_5  NO_LOAD
#define vload_partial_4_6  NO_LOAD
#define vload_partial_4_7  NO_LOAD
#define vload_partial_4_8  NO_LOAD
#define vload_partial_4_9  NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD
4338*c217d954SCole Faust
/* Partial-load dispatch table for width 8: vload_partial_8_<n> is the load used when n elements
 * are requested. Entries 1..8 map to vload_partial_<n>; 0 and everything above the width are
 * NO_LOAD. */
#define vload_partial_8_0  NO_LOAD
#define vload_partial_8_1  vload_partial_1
#define vload_partial_8_2  vload_partial_2
#define vload_partial_8_3  vload_partial_3
#define vload_partial_8_4  vload_partial_4
#define vload_partial_8_5  vload_partial_5
#define vload_partial_8_6  vload_partial_6
#define vload_partial_8_7  vload_partial_7
#define vload_partial_8_8  vload_partial_8
#define vload_partial_8_9  NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD
4356*c217d954SCole Faust
/* Partial-load dispatch table for width 16: vload_partial_16_<n> is the load used when n elements
 * are requested. Entry 0 is NO_LOAD; entries 1..16 map to vload_partial_<n>. */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
4374*c217d954SCole Faust
4375*c217d954SCole Faust
/* vload_partial_1 .. vload_partial_4: load the first 1 to 4 components of DATA with a single
 * vload<n>(OFFSET, PTR) and assign them through a component swizzle (.s0, .s01, .s012, .s0123).
 * Each expansion is a complete statement including its trailing semicolon.
 *
 * DATA   - vector variable receiving the loaded components
 * OFFSET - offset argument handed to vload<n>
 * PTR    - source address
 */
#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);
4387*c217d954SCole Faust
4388*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
4389*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4390*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
4391*c217d954SCole Faust
4392*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
4393*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4394*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
4395*c217d954SCole Faust
4396*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
4397*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4398*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
4399*c217d954SCole Faust
4400*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
4401*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
4402*c217d954SCole Faust
4403*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
4404*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4405*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
4406*c217d954SCole Faust
4407*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
4408*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4409*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
4410*c217d954SCole Faust
4411*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
4412*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4413*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
4414*c217d954SCole Faust
4415*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
4416*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4417*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
4418*c217d954SCole Faust
4419*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
4420*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4421*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
4422*c217d954SCole Faust
4423*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
4424*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4425*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
4426*c217d954SCole Faust
4427*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
4428*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4429*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
4430*c217d954SCole Faust
4431*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
4432*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
4433*c217d954SCole Faust
4434*c217d954SCole Faust
4435*c217d954SCole Faust
/* Lookup values for vector sizes 4, 8 and 16: each is the vector size divided
 * by four. Presumably one pixel unit is one 4-component image texel, matching
 * the x1 / x2 / x4 image macros below - confirm against callers. */
4436*c217d954SCole Faust#define PIXEL_UNIT4 1
4437*c217d954SCole Faust#define PIXEL_UNIT8 2
4438*c217d954SCole Faust#define PIXEL_UNIT16 4
4439*c217d954SCole Faust
4440*c217d954SCole Faust
/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): pastes vec_size onto PIXEL_UNIT.
 * The extra _STR level lets vec_size be macro-expanded before the paste. Only
 * 4, 8 and 16 resolve to a defined PIXEL_UNIT name. */
4441*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
4442*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
4443*c217d954SCole Faust
4444*c217d954SCole Faust
/* read_image2d_float x1 / x2 / x4 (img, x_coord, y_coord): issue 1, 2 or 4
 * read_imagef calls at x_coord, x_coord + 1, ... on row y_coord and pack the
 * results into a float4 / float8 / float16.
 * The expansion ends with a semicolon, so these macros can only be used as the
 * last part of a statement, never inside a larger expression. */
4445*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
4446*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
4447*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
4448*c217d954SCole Faust
/* Half-precision counterparts built on read_imageh; only available when both
 * ARM_COMPUTE_OPENCL_FP16_ENABLED and cl_khr_fp16 are defined. */
4449*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4450*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
4451*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
4452*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
4453*c217d954SCole Faust#endif
4454*c217d954SCole Faust
/* write_image2d_float x1 / x2 / x4 (img, x_coord, y_coord, values): write
 * consecutive 4-lane slices of values (s0123, s4567, s89AB, sCDEF) to
 * x_coord, x_coord + 1, ... on row y_coord with write_imagef. The individual
 * writes are chained with the comma operator inside one parenthesised
 * expression, and the expansion ends with a semicolon. */
4455*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
4456*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
4457*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
4458*c217d954SCole Faust
/* Half-precision counterparts built on write_imageh, under the same guard as
 * the half-precision readers. */
4459*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4460*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
4461*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
4462*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
4463*c217d954SCole Faust#endif
4464*c217d954SCole Faust
4465*c217d954SCole Faust
/* READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatches to
 * read_image2d_<data_type>x<n0>. The _STR level lets data_type and n0 be
 * macro-expanded before they are pasted into the name. In this section only
 * float and half with n0 in 1, 2, 4 have a matching target macro. */
4466*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
4467*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
4468*c217d954SCole Faust
4469*c217d954SCole Faust
/* WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatches to
 * write_image2d_<data_type>x<n0> in the same way. */
4470*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
4471*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
4472*c217d954SCole Faust
/* VSTORE(size): expands to the identifier vstore<size>; the argument list is
 * supplied by the caller right after the macro. The _STR level lets size be
 * macro-expanded before the paste. */
4473*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
4474*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
4475*c217d954SCole Faust
/* Size-1 aliases: pasting a scalar type name with the size 1 (for example
 * float with 1) produces float1, which these defines map back to the scalar
 * type so that size-generic macros also work for a single element. */
4476*c217d954SCole Faust#define float1 float
4477*c217d954SCole Faust#define half1 half
4478*c217d954SCole Faust#define char1 char
4479*c217d954SCole Faust#define uchar1 uchar
4480*c217d954SCole Faust#define short1 short
4481*c217d954SCole Faust#define ushort1 ushort
4482*c217d954SCole Faust#define int1 int
4483*c217d954SCole Faust#define uint1 uint
4484*c217d954SCole Faust#define long1 long
4485*c217d954SCole Faust#define ulong1 ulong
4486*c217d954SCole Faust#define double1 double
4487*c217d954SCole Faust
/* Scalar stand-ins for the missing vload1 / vstore1 built-ins, so that
 * size-generic code can paste the size 1 onto vload / vstore.
 *
 *   vload1(OFFSET, PTR)        reads  the element at PTR + OFFSET
 *   vstore1(DATA, OFFSET, PTR) writes DATA to the element at PTR + OFFSET
 *
 * Every argument and the whole expansion are parenthesised so that compound
 * arguments (for example a conditional OFFSET) and surrounding operators
 * (for example a subscript applied to the result of vload1) bind as the call
 * syntax suggests. */
4488*c217d954SCole Faust#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
4489*c217d954SCole Faust#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
4490*c217d954SCole Faust
4491*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size): expands to the identifier
 * vstore_partial_<size>_<store_size>, which the tables below map to the macro
 * that stores the first store_size lanes of a size-lane vector. The argument
 * list (data, offset, pointer) is supplied by the caller after the macro. */
4492*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
4493*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
4494*c217d954SCole Faust
/* NO_STORE(data, offs, ptr): table entry for combinations that store nothing;
 * expands to an empty braced block and ignores its arguments. */
4495*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
4496*c217d954SCole Faust    {                             \
4497*c217d954SCole Faust    }
4498*c217d954SCole Faust
4499*c217d954SCole Faust
/* Table for a 1-lane value: only store_size 1 stores (through vstore1);
 * 0 and every size above 1 map to NO_STORE. */
4500*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
4501*c217d954SCole Faust#define vstore_partial_1_1 vstore1
4502*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
4503*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
4504*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
4505*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
4506*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
4507*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
4508*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
4509*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
4510*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
4511*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
4512*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
4513*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
4514*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
4515*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
4516*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
4517*c217d954SCole Faust
/* Table for a 2-lane vector: store_size 1 and 2 map to vstore_partial_1 / _2;
 * 0 and every size above 2 map to NO_STORE. */
4518*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
4519*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
4520*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
4521*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
4522*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
4523*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
4524*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
4525*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
4526*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
4527*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
4528*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
4529*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
4530*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
4531*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
4532*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
4533*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
4534*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
4535*c217d954SCole Faust
/* Table for a 3-lane vector: store_size 1..3 map to vstore_partial_1.._3;
 * 0 and every size above 3 map to NO_STORE. */
4536*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
4537*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
4538*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
4539*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
4540*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
4541*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
4542*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
4543*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
4544*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
4545*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
4546*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
4547*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
4548*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
4549*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
4550*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
4551*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
4552*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
4553*c217d954SCole Faust
/* Table for a 4-lane vector: store_size 1..4 map to vstore_partial_1.._4;
 * 0 and every size above 4 map to NO_STORE. */
4554*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
4555*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
4556*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
4557*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
4558*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
4559*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
4560*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
4561*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
4562*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
4563*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
4564*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
4565*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
4566*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
4567*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
4568*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
4569*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
4570*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
4571*c217d954SCole Faust
/* Table for an 8-lane vector: store_size 1..8 map to vstore_partial_1.._8;
 * 0 and every size above 8 map to NO_STORE. */
4572*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
4573*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
4574*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
4575*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
4576*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
4577*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
4578*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
4579*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
4580*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
4581*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
4582*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
4583*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
4584*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
4585*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
4586*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
4587*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
4588*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
4589*c217d954SCole Faust
/* Table for a 16-lane vector: store_size 1..16 map to vstore_partial_1.._16;
 * only store_size 0 maps to NO_STORE. */
4590*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
4591*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
4592*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
4593*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
4594*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
4595*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
4596*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
4597*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
4598*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
4599*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
4600*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
4601*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
4602*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
4603*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
4604*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
4605*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
4606*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
4607*c217d954SCole Faust
4608*c217d954SCole Faust
/* vstore_partial_<N>(DATA, OFFSET, PTR): store the first N lanes of DATA to
 * memory addressed through PTR.
 *
 * N = 1, 2, 3, 4, 8 and 16 use a single vstore<N>. The remaining sizes are
 * composed: the low 4 (or 8) lanes are stored first, then the rest is stored
 * at PTR + 4 (or PTR + 8).
 *
 * Every expansion already ends with a semicolon and the composed sizes expand
 * to two statements, so these macros are not safe as the body of an unbraced
 * if / else / for.
 *
 * NOTE(review): the vstore<N> built-ins write to PTR + OFFSET * N, while
 * vstore1 in this file is defined as *(OFFSET + PTR) = DATA. The pieces of a
 * composed store therefore only line up when OFFSET is 0 - confirm that all
 * callers pass 0. */
4609*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
4610*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
4611*c217d954SCole Faust
4612*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
4613*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
4614*c217d954SCole Faust
4615*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
4616*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
4617*c217d954SCole Faust
4618*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
4619*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
4620*c217d954SCole Faust
/* 5..7 lanes: 4-lane store of s0123, then 1 / 2 / 3 lanes at PTR + 4. */
4621*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
4622*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4623*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
4624*c217d954SCole Faust
4625*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
4626*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4627*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
4628*c217d954SCole Faust
4629*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
4630*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4631*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
4632*c217d954SCole Faust
4633*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
4634*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
4635*c217d954SCole Faust
/* 9..12 lanes: 8-lane store of s01234567, then 1 / 2 / 3 / 4 lanes at PTR + 8. */
4636*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
4637*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4638*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
4639*c217d954SCole Faust
4640*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
4641*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4642*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
4643*c217d954SCole Faust
4644*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
4645*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4646*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
4647*c217d954SCole Faust
4648*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
4649*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4650*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
4651*c217d954SCole Faust
/* 13..15 lanes: the whole upper half (s89abcdef) is handed to the 5 / 6 / 7
 * lane macro, which itself stores only the leading lanes of that half. */
4652*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
4653*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4654*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
4655*c217d954SCole Faust
4656*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
4657*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4658*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
4659*c217d954SCole Faust
4660*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
4661*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4662*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
4663*c217d954SCole Faust
4664*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
4665*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
4666*c217d954SCole Faust
4667*c217d954SCole Faust
4668*c217d954SCole Faust
4669*c217d954SCole Faust
4670*c217d954SCole Faust
/* Saturating-conversion spellings for floating-point destinations.
 * OpenCL C defines the _sat modifier only for integer destination types, so
 * the names that CONVERT_SAT builds for float / half are mapped onto the plain
 * conversion of the same type and width; the size-1 spellings map onto the
 * scalar conversion.
 *
 * convert_half_sat resolves to convert_half, in line with convert_half1_sat
 * and with the float entries, so that a saturating conversion to half yields a
 * half value rather than a float. */
4671*c217d954SCole Faust#define convert_float_sat convert_float
4672*c217d954SCole Faust#define convert_float1_sat convert_float
4673*c217d954SCole Faust#define convert_float2_sat convert_float2
4674*c217d954SCole Faust#define convert_float3_sat convert_float3
4675*c217d954SCole Faust#define convert_float4_sat convert_float4
4676*c217d954SCole Faust#define convert_float8_sat convert_float8
4677*c217d954SCole Faust#define convert_float16_sat convert_float16
4678*c217d954SCole Faust#define convert_half_sat convert_half
4679*c217d954SCole Faust#define convert_half1_sat convert_half
4680*c217d954SCole Faust#define convert_half2_sat convert_half2
4681*c217d954SCole Faust#define convert_half3_sat convert_half3
4682*c217d954SCole Faust#define convert_half4_sat convert_half4
4683*c217d954SCole Faust#define convert_half8_sat convert_half8
4684*c217d954SCole Faust#define convert_half16_sat convert_half16
4685*c217d954SCole Faust
/* Size-1 conversion aliases: convert_<type>1 maps to the scalar
 * convert_<type>, so that a conversion name built by pasting a type with the
 * size 1 still resolves. */
4686*c217d954SCole Faust#define convert_float1 convert_float
4687*c217d954SCole Faust#define convert_half1 convert_half
4688*c217d954SCole Faust#define convert_char1 convert_char
4689*c217d954SCole Faust#define convert_uchar1 convert_uchar
4690*c217d954SCole Faust#define convert_short1 convert_short
4691*c217d954SCole Faust#define convert_ushort1 convert_ushort
4692*c217d954SCole Faust#define convert_int1 convert_int
4693*c217d954SCole Faust#define convert_uint1 convert_uint
4694*c217d954SCole Faust#define convert_long1 convert_long
4695*c217d954SCole Faust#define convert_ulong1 convert_ulong
4696*c217d954SCole Faust#define convert_double1 convert_double
4697*c217d954SCole Faust
/* Size-1 saturating conversion aliases: convert_<type>1_sat maps to the scalar
 * convert_<type>_sat.
 *
 * The convert_uchar2_sat .. convert_uchar16_sat entries expand to their own
 * name; the preprocessor does not re-expand a macro inside its own expansion,
 * so they have no effect on the generated code.
 *
 * NOTE(review): convert_double1_sat maps to convert_double_sat, but OpenCL C
 * defines _sat only for integer destination types - confirm this alias is
 * never expanded. */
4698*c217d954SCole Faust#define convert_char1_sat convert_char_sat
4699*c217d954SCole Faust#define convert_uchar1_sat convert_uchar_sat
4700*c217d954SCole Faust#define convert_uchar2_sat convert_uchar2_sat
4701*c217d954SCole Faust#define convert_uchar3_sat convert_uchar3_sat
4702*c217d954SCole Faust#define convert_uchar4_sat convert_uchar4_sat
4703*c217d954SCole Faust#define convert_uchar8_sat convert_uchar8_sat
4704*c217d954SCole Faust#define convert_uchar16_sat convert_uchar16_sat
4705*c217d954SCole Faust#define convert_short1_sat convert_short_sat
4706*c217d954SCole Faust#define convert_ushort1_sat convert_ushort_sat
4707*c217d954SCole Faust#define convert_int1_sat convert_int_sat
4708*c217d954SCole Faust#define convert_uint1_sat convert_uint_sat
4709*c217d954SCole Faust#define convert_long1_sat convert_long_sat
4710*c217d954SCole Faust#define convert_ulong1_sat convert_ulong_sat
4711*c217d954SCole Faust#define convert_double1_sat convert_double_sat
4712*c217d954SCole Faust
/* VEC_DATA_TYPE(type, size): pastes type and size into a vector type name,
 * for example float with 4 gives float4. The _STR level lets both arguments be
 * macro-expanded before the paste. */
4713*c217d954SCole Faust#define VEC_DATA_TYPE_STR(type, size) type##size
4714*c217d954SCole Faust#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
4715*c217d954SCole Faust
/* CONVERT(x, type): expands to (convert_<type>((x))). */
4716*c217d954SCole Faust#define CONVERT_STR(x, type) (convert_##type((x)))
4717*c217d954SCole Faust#define CONVERT(x, type) CONVERT_STR(x, type)
4718*c217d954SCole Faust
/* CONVERT_SAT(x, type): expands to (convert_<type>_sat((x))). */
4719*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
4720*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
4721*c217d954SCole Faust
/* CONVERT_SAT_ROUND(x, type, round): expands to
 * (convert_<type>_sat_<round>((x))); round is pasted verbatim as the
 * rounding-mode suffix. */
4722*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
4723*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
4724*c217d954SCole Faust
/* Per-type lookup from an element type to an integer vector type: integer
 * types map to themselves, half maps to short and float maps to int.
 * Presumably this is the condition / mask type to pair with the select
 * built-in for data of the given type - confirm against callers. */
4725*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size
4726*c217d954SCole Faust#define select_vec_dt_char(size) char##size
4727*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size
4728*c217d954SCole Faust#define select_vec_dt_short(size) short##size
4729*c217d954SCole Faust#define select_vec_dt_half(size) short##size
4730*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size
4731*c217d954SCole Faust#define select_vec_dt_int(size) int##size
4732*c217d954SCole Faust#define select_vec_dt_float(size) int##size
4733*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size
4734*c217d954SCole Faust#define select_vec_dt_long(size) long##size
4735*c217d954SCole Faust
/* SELECT_VEC_DATA_TYPE(type, size): dispatches to select_vec_dt_<type>(size).
 * SELECT_DATA_TYPE(type): the same lookup with size fixed to 1; the resulting
 * <type>1 name resolves to the scalar type through the size-1 aliases. */
4736*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
4737*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
4738*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
4739*c217d954SCole Faust
/* Per-type lookup from an element type to a signed integer vector type:
 * unsigned types map to their signed counterpart, signed types map to
 * themselves, half maps to short and float maps to int. */
4740*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size
4741*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size
4742*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size
4743*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size
4744*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size
4745*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size
4746*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size
4747*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size
4748*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size
4749*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size
4750*c217d954SCole Faust
/* SIGNED_INT_VEC_DATA_TYPE(type, size): dispatches to
 * signed_int_vec_dt_<type>(size).
 * SIGNED_INT_DATA_TYPE(type): the same lookup with size fixed to 1. */
4751*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
4752*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
4753*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
4754*c217d954SCole Faust
/* sum_reduce_<N>(x): sum of the N lanes of vector x, built by splitting the
 * vector into halves (N = 3 adds the third lane to the sum of the first two).
 * These helpers expand to a bare chain of additions without outer parentheses;
 * use SUM_REDUCE below rather than calling them directly. */
4755*c217d954SCole Faust#define sum_reduce_1(x) (x)
4756*c217d954SCole Faust#define sum_reduce_2(x) ((x).s0) + ((x).s1)
4757*c217d954SCole Faust#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
4758*c217d954SCole Faust#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
4759*c217d954SCole Faust#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
4760*c217d954SCole Faust#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
4761*c217d954SCole Faust
/* SUM_REDUCE(x, size): sum of the size lanes of x; size must be 1, 2, 3, 4, 8
 * or 16. The whole expansion is wrapped in parentheses so that it behaves as a
 * single operand next to higher-precedence operators (k * SUM_REDUCE(v, 4)).
 * The parentheses are added here and not inside the helpers on purpose: the
 * order in which the lanes are added, and with it floating-point rounding,
 * stays exactly as before. */
4762*c217d954SCole Faust#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
4763*c217d954SCole Faust#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
4764*c217d954SCole Faust
/* prod_reduce_<N>(x): product of the N lanes of vector x, built by splitting
 * the vector into halves (N = 3 multiplies the third lane onto the product of
 * the first two). These helpers expand to a bare chain of multiplications
 * without outer parentheses; use PROD_REDUCE below rather than calling them
 * directly. */
4765*c217d954SCole Faust#define prod_reduce_1(x) (x)
4766*c217d954SCole Faust#define prod_reduce_2(x) ((x).s0) * ((x).s1)
4767*c217d954SCole Faust#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
4768*c217d954SCole Faust#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
4769*c217d954SCole Faust#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
4770*c217d954SCole Faust#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
4771*c217d954SCole Faust
/* PROD_REDUCE(x, size): product of the size lanes of x; size must be 1, 2, 3,
 * 4, 8 or 16. The whole expansion is wrapped in parentheses so that it behaves
 * as a single operand, for example as a divisor (a / PROD_REDUCE(v, 2)).
 * The parentheses are added here and not inside the helpers on purpose: the
 * order in which the lanes are multiplied, and with it floating-point
 * rounding, stays exactly as before. */
4772*c217d954SCole Faust#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
4773*c217d954SCole Faust#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
4774*c217d954SCole Faust
/* max_reduce_<N>(x): maximum over the N lanes of vector x, built from nested
 * calls to the max built-in on the two halves of the vector (N = 3 compares
 * the third lane against the maximum of the first two). Each expansion is a
 * single function call, so it is safe inside larger expressions. */
4775*c217d954SCole Faust#define max_reduce_1(x) (x)
4776*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1))
4777*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
4778*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
4779*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
4780*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
4781*c217d954SCole Faust
/* MAX_REDUCE(x, size): dispatches to max_reduce_<size>; size must be 1, 2, 3,
 * 4, 8 or 16. */
4782*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
4783*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
4784*c217d954SCole Faust
/* VECTOR_DECLARATION(name): expands to four comma-separated kernel parameter
 * declarations, all prefixed with name: a __global uchar pointer, the x
 * stride, the x step and the byte offset of the first element. No trailing
 * comma is emitted. */
4785*c217d954SCole Faust#define VECTOR_DECLARATION(name)     \
4786*c217d954SCole Faust    __global uchar *name##_ptr,      \
4787*c217d954SCole Faust    uint        name##_stride_x, \
4788*c217d954SCole Faust    uint        name##_step_x,   \
4789*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4790*c217d954SCole Faust
/* IMAGE_DECLARATION(name): expands to six comma-separated kernel parameter
 * declarations, all prefixed with name: a __global uchar pointer, stride and
 * step for x and y, and the byte offset of the first element. No trailing
 * comma is emitted. */
4791*c217d954SCole Faust#define IMAGE_DECLARATION(name)      \
4792*c217d954SCole Faust    __global uchar *name##_ptr,      \
4793*c217d954SCole Faust    uint        name##_stride_x, \
4794*c217d954SCole Faust    uint        name##_step_x,   \
4795*c217d954SCole Faust    uint        name##_stride_y, \
4796*c217d954SCole Faust    uint        name##_step_y,   \
4797*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4798*c217d954SCole Faust
/* TENSOR3D_DECLARATION(name): expands to eight comma-separated kernel
 * parameter declarations, all prefixed with name: a __global uchar pointer,
 * stride and step for x, y and z, and the byte offset of the first element.
 * No trailing comma is emitted. */
4799*c217d954SCole Faust#define TENSOR3D_DECLARATION(name)   \
4800*c217d954SCole Faust    __global uchar *name##_ptr,      \
4801*c217d954SCole Faust    uint        name##_stride_x, \
4802*c217d954SCole Faust    uint        name##_step_x,   \
4803*c217d954SCole Faust    uint        name##_stride_y, \
4804*c217d954SCole Faust    uint        name##_step_y,   \
4805*c217d954SCole Faust    uint        name##_stride_z, \
4806*c217d954SCole Faust    uint        name##_step_z,   \
4807*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4808*c217d954SCole Faust
/* TENSOR4D_DECLARATION(name): expands to ten comma-separated kernel parameter
 * declarations, all prefixed with name: a __global uchar pointer, stride and
 * step for x, y, z and w, and the byte offset of the first element. No
 * trailing comma is emitted. */
4809*c217d954SCole Faust#define TENSOR4D_DECLARATION(name)   \
4810*c217d954SCole Faust    __global uchar *name##_ptr,      \
4811*c217d954SCole Faust    uint        name##_stride_x, \
4812*c217d954SCole Faust    uint        name##_step_x,   \
4813*c217d954SCole Faust    uint        name##_stride_y, \
4814*c217d954SCole Faust    uint        name##_step_y,   \
4815*c217d954SCole Faust    uint        name##_stride_z, \
4816*c217d954SCole Faust    uint        name##_step_z,   \
4817*c217d954SCole Faust    uint        name##_stride_w, \
4818*c217d954SCole Faust    uint        name##_step_w,   \
4819*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4820*c217d954SCole Faust
/* TENSOR5D_DECLARATION(name): expands to twelve comma-separated kernel
 * parameter declarations, all prefixed with name: a __global uchar pointer,
 * stride and step for x, y, z, w and v, and the byte offset of the first
 * element. No trailing comma is emitted. */
4821*c217d954SCole Faust#define TENSOR5D_DECLARATION(name)   \
4822*c217d954SCole Faust    __global uchar *name##_ptr,      \
4823*c217d954SCole Faust    uint        name##_stride_x, \
4824*c217d954SCole Faust    uint        name##_step_x,   \
4825*c217d954SCole Faust    uint        name##_stride_y, \
4826*c217d954SCole Faust    uint        name##_step_y,   \
4827*c217d954SCole Faust    uint        name##_stride_z, \
4828*c217d954SCole Faust    uint        name##_step_z,   \
4829*c217d954SCole Faust    uint        name##_stride_w, \
4830*c217d954SCole Faust    uint        name##_step_w,   \
4831*c217d954SCole Faust    uint        name##_stride_v, \
4832*c217d954SCole Faust    uint        name##_step_v,   \
4833*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4834*c217d954SCole Faust
/* CONVERT_TO_VECTOR_STRUCT(name): forwards the parameters produced by
 * VECTOR_DECLARATION(name) to update_vector_workitem_ptr (defined outside this
 * section). */
4835*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \
4836*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
4837*c217d954SCole Faust
/* Same call with the x step replaced by 0. */
4838*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
4839*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
4840*c217d954SCole Faust
/* CONVERT_TO_IMAGE_STRUCT(name): forwards the parameters produced by
 * IMAGE_DECLARATION(name) to update_image_workitem_ptr (defined outside this
 * section). */
4841*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \
4842*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
4843*c217d954SCole Faust
/* Same call with the x and y steps replaced by 0. */
4844*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
4845*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
4846*c217d954SCole Faust
/* CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name): forwards the parameters produced by
 * TENSOR3D_DECLARATION(name) to update_image_from_tensor3D_workitem_ptr
 * (defined outside this section).
 * This macro used to be defined twice in a row with identical bodies; the
 * redundant second definition has been dropped. */
4847*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
4848*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
4849*c217d954SCole Faust
/* Same call with the x and y steps replaced by 0.
 * NOTE(review): unlike the other NO_STEP variants, the z step is still
 * forwarded as name##_step_z instead of 0 - confirm this is intended. */
4850*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
4851*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
4855*c217d954SCole Faust
/* CONVERT_TO_TENSOR3D_STRUCT(name): forwards the parameters produced by
 * TENSOR3D_DECLARATION(name) to update_tensor3D_workitem_ptr (defined outside
 * this section). */
4856*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
4857*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4858*c217d954SCole Faust                                 name##_stride_z, name##_step_z)
4859*c217d954SCole Faust
/* Same call with the x, y and z steps replaced by 0. */
4860*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
4861*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
4862*c217d954SCole Faust
/* CONVERT_TO_TENSOR4D_STRUCT(name, mod_size): forwards the parameters produced
 * by TENSOR4D_DECLARATION(name), followed by mod_size, to
 * update_tensor4D_workitem_ptr (defined outside this section). */
4863*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
4864*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4865*c217d954SCole Faust                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
4866*c217d954SCole Faust
/* Same call with the x, y, z and w steps replaced by 0. */
4867*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
4868*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
4869*c217d954SCole Faust
/* CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name): forwards the same eight
 * TENSOR3D parameters, steps included, to tensor3D_ptr_no_update (defined
 * outside this section). */
4870*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
4871*c217d954SCole Faust    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4872*c217d954SCole Faust                           name##_stride_z, name##_step_z)
4873*c217d954SCole Faust
4874*c217d954SCole Faust
/** View of a one-dimensional buffer in __global memory. */
4875*c217d954SCole Fausttypedef struct Vector
4876*c217d954SCole Faust{
4877*c217d954SCole Faust    __global uchar *ptr; /**< Byte pointer into the buffer */
4878*c217d954SCole Faust    int             offset_first_element_in_bytes; /**< Offset of the first element, in bytes */
4879*c217d954SCole Faust    int             stride_x; /**< Byte distance per unit of x (vector_offset adds x * stride_x to the uchar pointer) */
4880*c217d954SCole Faust} Vector;
4881*c217d954SCole Faust
4882*c217d954SCole Faust
/** View of a two-dimensional buffer in __global memory. */
4883*c217d954SCole Fausttypedef struct Image
4884*c217d954SCole Faust{
4885*c217d954SCole Faust    __global uchar *ptr; /**< Byte pointer into the buffer */
4886*c217d954SCole Faust    int             offset_first_element_in_bytes; /**< Offset of the first element, in bytes */
4887*c217d954SCole Faust    int             stride_x; /**< Byte distance per unit of x */
4888*c217d954SCole Faust    int             stride_y; /**< Byte distance per unit of y */
4889*c217d954SCole Faust} Image;
4890*c217d954SCole Faust
4891*c217d954SCole Faust
/** View of a three-dimensional buffer in __global memory. */
4892*c217d954SCole Fausttypedef struct Tensor3D
4893*c217d954SCole Faust{
4894*c217d954SCole Faust    __global uchar *ptr; /**< Byte pointer into the buffer */
4895*c217d954SCole Faust    int             offset_first_element_in_bytes; /**< Offset of the first element, in bytes */
4896*c217d954SCole Faust    int             stride_x; /**< Byte distance per unit of x */
4897*c217d954SCole Faust    int             stride_y; /**< Byte distance per unit of y */
4898*c217d954SCole Faust    int             stride_z; /**< Byte distance per unit of z */
4899*c217d954SCole Faust} Tensor3D;
4900*c217d954SCole Faust
4901*c217d954SCole Faust
/** View of a four-dimensional buffer in __global memory. */
4902*c217d954SCole Fausttypedef struct Tensor4D
4903*c217d954SCole Faust{
4904*c217d954SCole Faust    __global uchar *ptr; /**< Byte pointer into the buffer */
4905*c217d954SCole Faust    int             offset_first_element_in_bytes; /**< Offset of the first element, in bytes */
4906*c217d954SCole Faust    int             stride_x; /**< Byte distance per unit of x */
4907*c217d954SCole Faust    int             stride_y; /**< Byte distance per unit of y */
4908*c217d954SCole Faust    int             stride_z; /**< Byte distance per unit of z */
4909*c217d954SCole Faust    int             stride_w; /**< Byte distance per unit of w */
4910*c217d954SCole Faust} Tensor4D;
4911*c217d954SCole Faust
4912*c217d954SCole Faust
/** Build a Vector view and move its pointer to the calling work-item's position.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the returned view's stride_x
 * @param[in] step_x                        Bytes added per unit of get_global_id(0)
 *
 * @return A Vector whose ptr is ptr + offset + get_global_id(0) * step_x
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;

    const size_t gid_x = get_global_id(0);

    /* The offset is read back from the (int) struct field on purpose, so the
     * arithmetic uses exactly the same operand types as the stored view. */
    view.ptr += view.offset_first_element_in_bytes + gid_x * step_x;
    return view;
}
4924*c217d954SCole Faust
4925*c217d954SCole Faust
/** Build an Image view and move its pointer to the calling work-item's position.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the returned view's stride_x
 * @param[in] step_x                        Bytes added per unit of get_global_id(0)
 * @param[in] stride_y                      Value stored in the returned view's stride_y
 * @param[in] step_y                        Bytes added per unit of get_global_id(1)
 *
 * @return An Image whose ptr is advanced by the offset plus the x and y work-item steps
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;

    const size_t gid_x = get_global_id(0);
    const size_t gid_y = get_global_id(1);

    view.ptr += view.offset_first_element_in_bytes + gid_x * step_x + gid_y * step_y;
    return view;
}
4938*c217d954SCole Faust
4939*c217d954SCole Faust
/** Build an Image view from 3D tensor arguments and move its pointer to the calling work-item.
 *
 * Only stride_x and stride_y are stored in the returned view; stride_z is accepted but
 * not kept. The z step still contributes to the pointer advance.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the returned view's stride_x
 * @param[in] step_x                        Bytes added per unit of get_global_id(0)
 * @param[in] stride_y                      Value stored in the returned view's stride_y
 * @param[in] step_y                        Bytes added per unit of get_global_id(1)
 * @param[in] stride_z                      Not used by this function
 * @param[in] step_z                        Bytes added per unit of get_global_id(2)
 *
 * @return An Image whose ptr is advanced by the offset plus the x, y and z work-item steps
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;

    const size_t gid_x = get_global_id(0);
    const size_t gid_y = get_global_id(1);
    const size_t gid_z = get_global_id(2);

    view.ptr += view.offset_first_element_in_bytes + gid_x * step_x + gid_y * step_y + gid_z * step_z;
    return view;
}
4952*c217d954SCole Faust
4953*c217d954SCole Faust
/** Build a Tensor3D view and move its pointer to the calling work-item's position.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the returned view's stride_x
 * @param[in] step_x                        Bytes added per unit of get_global_id(0)
 * @param[in] stride_y                      Value stored in the returned view's stride_y
 * @param[in] step_y                        Bytes added per unit of get_global_id(1)
 * @param[in] stride_z                      Value stored in the returned view's stride_z
 * @param[in] step_z                        Bytes added per unit of get_global_id(2)
 *
 * @return A Tensor3D whose ptr is advanced by the offset plus the x, y and z work-item steps
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;

    const size_t gid_x = get_global_id(0);
    const size_t gid_y = get_global_id(1);
    const size_t gid_z = get_global_id(2);

    view.ptr += view.offset_first_element_in_bytes + gid_x * step_x + gid_y * step_y + gid_z * step_z;
    return view;
}
4967*c217d954SCole Faust
4968*c217d954SCole Faust
/** Build a Tensor3D view without moving its pointer.
 *
 * The step_x, step_y and step_z arguments are accepted but not used: the returned
 * view's ptr is exactly the ptr argument.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes (stored, not applied)
 * @param[in] stride_x                      Value stored in the returned view's stride_x
 * @param[in] step_x                        Not used by this function
 * @param[in] stride_y                      Value stored in the returned view's stride_y
 * @param[in] step_y                        Not used by this function
 * @param[in] stride_z                      Value stored in the returned view's stride_z
 * @param[in] step_z                        Not used by this function
 *
 * @return A Tensor3D holding the given pointer, offset and strides
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;
    return view;
}
4981*c217d954SCole Faust
/** Build a Tensor4D view and move its pointer to the calling work-item's position.
 *
 * The third global id is split into two indices: (id % mod_size) is scaled by step_z
 * and (id / mod_size) is scaled by step_w.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Value stored in the returned view's stride_x
 * @param[in] step_x                        Bytes added per unit of get_global_id(0)
 * @param[in] stride_y                      Value stored in the returned view's stride_y
 * @param[in] step_y                        Bytes added per unit of get_global_id(1)
 * @param[in] stride_z                      Value stored in the returned view's stride_z
 * @param[in] step_z                        Bytes added per unit of (get_global_id(2) % mod_size)
 * @param[in] stride_w                      Value stored in the returned view's stride_w
 * @param[in] step_w                        Bytes added per unit of (get_global_id(2) / mod_size)
 * @param[in] mod_size                      Divisor used to split get_global_id(2); not checked against zero
 *
 * @return A Tensor4D whose ptr is advanced by the offset plus the four work-item steps
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D view;
    view.ptr                           = ptr;
    view.offset_first_element_in_bytes = offset_first_element_in_bytes;
    view.stride_x                      = stride_x;
    view.stride_y                      = stride_y;
    view.stride_z                      = stride_z;
    view.stride_w                      = stride_w;

    const size_t gid_x  = get_global_id(0);
    const size_t gid_y  = get_global_id(1);
    const size_t gid_zw = get_global_id(2);
    const size_t z_idx  = gid_zw % mod_size;
    const size_t w_idx  = gid_zw / mod_size;

    view.ptr += view.offset_first_element_in_bytes + gid_x * step_x + gid_y * step_y + z_idx * step_z + w_idx * step_w;
    return view;
}
4999*c217d954SCole Faust
5000*c217d954SCole Faust
/** Address of element x relative to a Vector view's current pointer.
 *
 * @param[in] vec View to index
 * @param[in] x   Index along x (may be negative)
 *
 * @return vec->ptr advanced by x * vec->stride_x bytes
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    __global const uchar *element = vec->ptr;
    element += x * vec->stride_x;
    return element;
}
5005*c217d954SCole Faust
5006*c217d954SCole Faust
/** Address of element (x, y) relative to an Image view's current pointer.
 *
 * @param[in] img View to index
 * @param[in] x   Index along x (may be negative)
 * @param[in] y   Index along y (may be negative)
 *
 * @return img->ptr advanced by x * img->stride_x and then y * img->stride_y bytes
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    __global uchar *element = img->ptr;
    /* Each term is applied to the pointer separately rather than summed as int first. */
    element += x * img->stride_x;
    element += y * img->stride_y;
    return element;
}
5011*c217d954SCole Faust
5012*c217d954SCole Faust
/** Address of element (x, y, z) relative to a Tensor3D view's current pointer.
 *
 * @param[in] tensor View to index
 * @param[in] x      Index along x (may be negative)
 * @param[in] y      Index along y (may be negative)
 * @param[in] z      Index along z (may be negative)
 *
 * @return tensor->ptr advanced by x, y and z times the matching strides, in bytes
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    __global const uchar *element = tensor->ptr;
    /* Each term is applied to the pointer separately rather than summed as int first. */
    element += x * tensor->stride_x;
    element += y * tensor->stride_y;
    element += z * tensor->stride_z;
    return element;
}
5017*c217d954SCole Faust
5018*c217d954SCole Faust
/** Address of element (x, y, z, w) relative to a Tensor4D view's current pointer.
 *
 * @param[in] tensor View to index
 * @param[in] x      Index along x (may be negative)
 * @param[in] y      Index along y (may be negative)
 * @param[in] z      Index along z (may be negative)
 * @param[in] w      Index along w (may be negative)
 *
 * @return tensor->ptr advanced by x, y, z and w times the matching strides, in bytes
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    __global const uchar *element = tensor->ptr;
    /* Each term is applied to the pointer separately rather than summed as int first. */
    element += x * tensor->stride_x;
    element += y * tensor->stride_y;
    element += z * tensor->stride_z;
    element += w * tensor->stride_w;
    return element;
}
5023*c217d954SCole Faust
5024*c217d954SCole Faust
/** Convert a linear element index into an address inside a Tensor3D view.
 *
 * The index is decomposed as index = z * (width * height) + y * width + x and the
 * resulting coordinates are scaled by the view's strides. Unlike the *_offset helpers,
 * offset_first_element_in_bytes is added here as well.
 *
 * @param[in] tensor View to index
 * @param[in] width  Number of elements along x
 * @param[in] height Number of elements along y
 * @param[in] depth  Not used by this function
 * @param[in] index  Linear element index
 *
 * @return Pointer to the addressed element; width and height are not checked against zero
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_size = width * height;

    const uint z        = index / plane_size;
    const uint in_plane = index % plane_size;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    __global const uchar *element = tensor->ptr;
    element += x * tensor->stride_x;
    element += y * tensor->stride_y;
    element += z * tensor->stride_z;
    element += tensor->offset_first_element_in_bytes;
    return element;
}
5041*c217d954SCole Faust
5042*c217d954SCole Faust#endif
5043*c217d954SCole Faust
5044*c217d954SCole Faust
5045*c217d954SCole Faust
/*
 * REPEAT_3_<k>(P_X, P_A, P_B, P_C) expands to k invocations of the macro P_X##_DEF,
 * each taking an ID followed by the three forwarded arguments. The IDs run downwards
 * from k-1 to 0; IDs 10..15 are spelled as the hex digits A..F. Invocations are
 * separated by ';' and the last one (ID 0) has no trailing ';', so the caller supplies it.
 */
5046*c217d954SCole Faust#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
5047*c217d954SCole Faust#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
5048*c217d954SCole Faust    P_X##_DEF(1, P_A, P_B, P_C);       \
5049*c217d954SCole Faust    REPEAT_3_1(P_X, P_A, P_B, P_C)
5050*c217d954SCole Faust#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
5051*c217d954SCole Faust    P_X##_DEF(2, P_A, P_B, P_C);       \
5052*c217d954SCole Faust    REPEAT_3_2(P_X, P_A, P_B, P_C)
5053*c217d954SCole Faust#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
5054*c217d954SCole Faust    P_X##_DEF(3, P_A, P_B, P_C);       \
5055*c217d954SCole Faust    REPEAT_3_3(P_X, P_A, P_B, P_C)
5056*c217d954SCole Faust#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
5057*c217d954SCole Faust    P_X##_DEF(4, P_A, P_B, P_C);       \
5058*c217d954SCole Faust    REPEAT_3_4(P_X, P_A, P_B, P_C)
5059*c217d954SCole Faust#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
5060*c217d954SCole Faust    P_X##_DEF(5, P_A, P_B, P_C);       \
5061*c217d954SCole Faust    REPEAT_3_5(P_X, P_A, P_B, P_C)
5062*c217d954SCole Faust#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
5063*c217d954SCole Faust    P_X##_DEF(6, P_A, P_B, P_C);       \
5064*c217d954SCole Faust    REPEAT_3_6(P_X, P_A, P_B, P_C)
5065*c217d954SCole Faust#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
5066*c217d954SCole Faust    P_X##_DEF(7, P_A, P_B, P_C);       \
5067*c217d954SCole Faust    REPEAT_3_7(P_X, P_A, P_B, P_C)
5068*c217d954SCole Faust#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
5069*c217d954SCole Faust    P_X##_DEF(8, P_A, P_B, P_C);       \
5070*c217d954SCole Faust    REPEAT_3_8(P_X, P_A, P_B, P_C)
5071*c217d954SCole Faust#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
5072*c217d954SCole Faust    P_X##_DEF(9, P_A, P_B, P_C);        \
5073*c217d954SCole Faust    REPEAT_3_9(P_X, P_A, P_B, P_C)
5074*c217d954SCole Faust#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
5075*c217d954SCole Faust    P_X##_DEF(A, P_A, P_B, P_C);        \
5076*c217d954SCole Faust    REPEAT_3_10(P_X, P_A, P_B, P_C)
5077*c217d954SCole Faust#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
5078*c217d954SCole Faust    P_X##_DEF(B, P_A, P_B, P_C);        \
5079*c217d954SCole Faust    REPEAT_3_11(P_X, P_A, P_B, P_C)
5080*c217d954SCole Faust#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
5081*c217d954SCole Faust    P_X##_DEF(C, P_A, P_B, P_C);        \
5082*c217d954SCole Faust    REPEAT_3_12(P_X, P_A, P_B, P_C)
5083*c217d954SCole Faust#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
5084*c217d954SCole Faust    P_X##_DEF(D, P_A, P_B, P_C);        \
5085*c217d954SCole Faust    REPEAT_3_13(P_X, P_A, P_B, P_C)
5086*c217d954SCole Faust#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
5087*c217d954SCole Faust    P_X##_DEF(E, P_A, P_B, P_C);        \
5088*c217d954SCole Faust    REPEAT_3_14(P_X, P_A, P_B, P_C)
5089*c217d954SCole Faust#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
5090*c217d954SCole Faust    P_X##_DEF(F, P_A, P_B, P_C);        \
5091*c217d954SCole Faust    REPEAT_3_15(P_X, P_A, P_B, P_C)
5092*c217d954SCole Faust
/*
 * Entry point: REPEAT_3_N(P_NUM, P_OP, ...) selects REPEAT_3_<P_NUM>. The extra
 * REPEAT_DEF_3_N level lets P_NUM be macro-expanded before it is pasted, so a macro
 * such as M0 can be passed as the count. Counts 1..16 are provided.
 */
5093*c217d954SCole Faust#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C)
5094*c217d954SCole Faust#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
5095*c217d954SCole Faust
5096*c217d954SCole Faust
/*
 * REPEAT_4_<k>(P_X, P_A, P_B, P_C, P_D) expands to k invocations of the macro P_X##_DEF,
 * each taking an ID followed by the four forwarded arguments. The IDs run downwards
 * from k-1 to 0; IDs 10..15 are spelled as the hex digits A..F. Invocations are
 * separated by ';' and the last one (ID 0) has no trailing ';', so the caller supplies it.
 */
5097*c217d954SCole Faust#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D)
5098*c217d954SCole Faust#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \
5099*c217d954SCole Faust    P_X##_DEF(1, P_A, P_B, P_C, P_D);       \
5100*c217d954SCole Faust    REPEAT_4_1(P_X, P_A, P_B, P_C, P_D)
5101*c217d954SCole Faust#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \
5102*c217d954SCole Faust    P_X##_DEF(2, P_A, P_B, P_C, P_D);       \
5103*c217d954SCole Faust    REPEAT_4_2(P_X, P_A, P_B, P_C, P_D)
5104*c217d954SCole Faust#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \
5105*c217d954SCole Faust    P_X##_DEF(3, P_A, P_B, P_C, P_D);       \
5106*c217d954SCole Faust    REPEAT_4_3(P_X, P_A, P_B, P_C, P_D)
5107*c217d954SCole Faust#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \
5108*c217d954SCole Faust    P_X##_DEF(4, P_A, P_B, P_C, P_D);       \
5109*c217d954SCole Faust    REPEAT_4_4(P_X, P_A, P_B, P_C, P_D)
5110*c217d954SCole Faust#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \
5111*c217d954SCole Faust    P_X##_DEF(5, P_A, P_B, P_C, P_D);       \
5112*c217d954SCole Faust    REPEAT_4_5(P_X, P_A, P_B, P_C, P_D)
5113*c217d954SCole Faust#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \
5114*c217d954SCole Faust    P_X##_DEF(6, P_A, P_B, P_C, P_D);       \
5115*c217d954SCole Faust    REPEAT_4_6(P_X, P_A, P_B, P_C, P_D)
5116*c217d954SCole Faust#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \
5117*c217d954SCole Faust    P_X##_DEF(7, P_A, P_B, P_C, P_D);       \
5118*c217d954SCole Faust    REPEAT_4_7(P_X, P_A, P_B, P_C, P_D)
5119*c217d954SCole Faust#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \
5120*c217d954SCole Faust    P_X##_DEF(8, P_A, P_B, P_C, P_D);       \
5121*c217d954SCole Faust    REPEAT_4_8(P_X, P_A, P_B, P_C, P_D)
5122*c217d954SCole Faust#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \
5123*c217d954SCole Faust    P_X##_DEF(9, P_A, P_B, P_C, P_D);        \
5124*c217d954SCole Faust    REPEAT_4_9(P_X, P_A, P_B, P_C, P_D)
5125*c217d954SCole Faust#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \
5126*c217d954SCole Faust    P_X##_DEF(A, P_A, P_B, P_C, P_D);        \
5127*c217d954SCole Faust    REPEAT_4_10(P_X, P_A, P_B, P_C, P_D)
5128*c217d954SCole Faust#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \
5129*c217d954SCole Faust    P_X##_DEF(B, P_A, P_B, P_C, P_D);        \
5130*c217d954SCole Faust    REPEAT_4_11(P_X, P_A, P_B, P_C, P_D)
5131*c217d954SCole Faust#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \
5132*c217d954SCole Faust    P_X##_DEF(C, P_A, P_B, P_C, P_D);        \
5133*c217d954SCole Faust    REPEAT_4_12(P_X, P_A, P_B, P_C, P_D)
5134*c217d954SCole Faust#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \
5135*c217d954SCole Faust    P_X##_DEF(D, P_A, P_B, P_C, P_D);        \
5136*c217d954SCole Faust    REPEAT_4_13(P_X, P_A, P_B, P_C, P_D)
5137*c217d954SCole Faust#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \
5138*c217d954SCole Faust    P_X##_DEF(E, P_A, P_B, P_C, P_D);        \
5139*c217d954SCole Faust    REPEAT_4_14(P_X, P_A, P_B, P_C, P_D)
5140*c217d954SCole Faust#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \
5141*c217d954SCole Faust    P_X##_DEF(F, P_A, P_B, P_C, P_D);        \
5142*c217d954SCole Faust    REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
5143*c217d954SCole Faust
/*
 * Entry point: REPEAT_4_N(P_NUM, P_OP, ...) selects REPEAT_4_<P_NUM>. The extra
 * REPEAT_DEF_4_N level lets P_NUM be macro-expanded before it is pasted.
 * Counts 1..16 are provided.
 */
5144*c217d954SCole Faust#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D)
5145*c217d954SCole Faust#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
5146*c217d954SCole Faust
5147*c217d954SCole Faust
/*
 * Each pair below is a per-ID operation NAME_DEF(ID, ...) plus a REPEAT_NAME(N, ...) wrapper
 * that applies it to the variables suffixed with the N IDs generated by REPEAT_3_N / REPEAT_4_N.
 */

/* Declare N variables VAR<ID> of type TYPE, each initialised to VAL. */
5148*c217d954SCole Faust#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
5149*c217d954SCole Faust#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
5150*c217d954SCole Faust
5151*c217d954SCole Faust
/* Declare N variables VAR_OUT<ID> of type TYPE_OUT, initialised with CONVERT(VAR_IN<ID>, TYPE_OUT).
 * CONVERT is defined outside this part of the file. */
5152*c217d954SCole Faust#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
5153*c217d954SCole Faust#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
5154*c217d954SCole Faust
5155*c217d954SCole Faust
/* Same as above but using CONVERT_SAT (defined outside this part of the file). */
5156*c217d954SCole Faust#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
5157*c217d954SCole Faust#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
5158*c217d954SCole Faust
5159*c217d954SCole Faust
/* VAR<ID> += (TYPE)VAL for N variables. */
5160*c217d954SCole Faust#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
5161*c217d954SCole Faust#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
5162*c217d954SCole Faust
5163*c217d954SCole Faust
/* VAR_A<ID> += VAR_B<ID> * VAL for N variable pairs. */
5164*c217d954SCole Faust#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
5165*c217d954SCole Faust#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
5166*c217d954SCole Faust
5167*c217d954SCole Faust
/* VAR<ID> += VEC for N variables. The TYPE slot of the _DEF macro is unused; the wrapper
 * fills it with an empty string literal as a placeholder. */
5168*c217d954SCole Faust#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
5169*c217d954SCole Faust#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
5170*c217d954SCole Faust
5171*c217d954SCole Faust
/* VAR_A<ID> += VAR_B<ID> for N variable pairs. The TYPE slot is an unused placeholder, as above. */
5172*c217d954SCole Faust#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
5173*c217d954SCole Faust#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
5174*c217d954SCole Faust
5175*c217d954SCole Faust
/* VAR<ID> = max(VAR<ID>, (TYPE)VAL) for N variables. */
5176*c217d954SCole Faust#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
5177*c217d954SCole Faust#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
5178*c217d954SCole Faust
5179*c217d954SCole Faust
/* VAR<ID> = min(VAR<ID>, (TYPE)VAL) for N variables. */
5180*c217d954SCole Faust#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
5181*c217d954SCole Faust#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
5182*c217d954SCole Faust
5183*c217d954SCole Faust
/* Replace VAR<ID> with ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR<ID>, RES_MUL, RES_SHIFT, SIZE).
 * The underlying macro is defined outside this part of the file. */
5184*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
5185*c217d954SCole Faust#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
5186*c217d954SCole Faust
5187*c217d954SCole Faust
/* Replace VAR<ID> with ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR<ID>, RES_MUL, RES_SHIFT, SIZE).
 * The underlying macro is defined outside this part of the file. */
5188*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
5189*c217d954SCole Faust#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
5190*c217d954SCole Faust
5191*c217d954SCole Faust
/*
 * Per-channel requantisation of VAR<ID>: both the GREATER_THAN_ONE and LESS_THAN_ONE
 * results are computed and select() keeps the LESS_THAN_ONE result where
 * RES_SHIFT >= 0 and the GREATER_THAN_ONE result elsewhere.
 *
 * The two temporaries are named VAR<ID>_shift_lt0 / VAR<ID>_shift_gt0. ID has to be
 * pasted with an explicit ## on both sides: written as VAR##ID_shift_lt0 the
 * preprocessor sees the single identifier ID_shift_lt0 and never substitutes ID.
 *
 * NOTE(review): SIZE is accepted for parity with the sibling *_DEF macros, but the
 * vector width used here is N0, which must be defined at the point of expansion.
 * Confirm against callers before switching to SIZE.
 *
 * The body is a GNU statement expression, so the temporaries are scoped to one expansion.
 */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT)                       \
    ({                                                                                                          \
        VEC_DATA_TYPE(int, N0)                                                                                  \
        VAR##ID##_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
        VEC_DATA_TYPE(int, N0)                                                                                  \
        VAR##ID##_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0);    \
        VAR##ID             = select(VAR##ID##_shift_lt0, VAR##ID##_shift_gt0, RES_SHIFT >= 0);                 \
    })
/* Apply the per-channel requantisation to the N variables VAR<ID> generated by REPEAT_4_N. */
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
5201*c217d954SCole Faust
5202*c217d954SCole Faust#endif
5203*c217d954SCole Faust
5204*c217d954SCole Faust#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
5205*c217d954SCole Faust
/* Paste two tokens together. The definition is kept token-for-token as it was, in case
 * CONCAT is also defined elsewhere in the translation unit (not visible from here). */
#define CONCAT(a, b) a##b

/*
 * ARM_DOT<k>(a, b, c): accumulate the k-component dot product of a and b into c using
 * fma, one component at a time (c = fma(a.sI, b.sI, c)). ARM_DOT1 treats a and b as
 * scalars; ARM_DOT8 / ARM_DOT16 recurse on the .lo and .hi halves.
 *
 * - a and b may be arbitrary expressions: every member access is written as (a).sN,
 *   (a).lo or (a).hi so that an argument such as *ptr is grouped before the component
 *   is selected.
 * - c is assigned to and is evaluated more than once, so it must be a modifiable
 *   lvalue without side effects (a parenthesised lvalue such as (acc.s0) is fine).
 * - The bodies are GNU statement expressions.
 */
#define ARM_DOT1(a, b, c) \
    ({                    \
        c = fma(a, b, c); \
    })
#define ARM_DOT2(a, b, c)           \
    ({                              \
        c = fma((a).s0, (b).s0, c); \
        c = fma((a).s1, (b).s1, c); \
    })
#define ARM_DOT3(a, b, c)           \
    ({                              \
        ARM_DOT2(a, b, c);          \
        c = fma((a).s2, (b).s2, c); \
    })
#define ARM_DOT4(a, b, c)           \
    ({                              \
        ARM_DOT3(a, b, c);          \
        c = fma((a).s3, (b).s3, c); \
    })
#define ARM_DOT8(a, b, c)            \
    ({                               \
        ARM_DOT4((a).lo, (b).lo, c); \
        ARM_DOT4((a).hi, (b).hi, c); \
    })
#define ARM_DOT16(a, b, c)           \
    ({                               \
        ARM_DOT8((a).lo, (b).lo, c); \
        ARM_DOT8((a).hi, (b).hi, c); \
    })
5237*c217d954SCole Faust
/*
 * ARM_DOT_K0XN0(k0, a, b, c): for every i in 0..N0-1, invoke ARM_DOT<k0> on
 * (a, b<i>, c.s<i>), where b<i> is the identifier formed by pasting i onto b and
 * indices 10..15 are spelled A..F. k0 reaches CONCAT as an ordinary macro argument,
 * so it is macro-expanded before being pasted onto ARM_DOT.
 * One definition is selected by the compile-time value of N0; only 2, 3, 4, 8 and 16
 * are handled and any other value stops compilation with #error.
 */
5238*c217d954SCole Faust#if N0 == 2
5239*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \
5240*c217d954SCole Faust    ({                             \
5241*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5242*c217d954SCole Faust        ((a), (b##0), (c.s0));     \
5243*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5244*c217d954SCole Faust        ((a), (b##1), (c.s1));     \
5245*c217d954SCole Faust    })
5246*c217d954SCole Faust#elif N0 == 3
5247*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \
5248*c217d954SCole Faust    ({                             \
5249*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5250*c217d954SCole Faust        ((a), (b##0), (c.s0));     \
5251*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5252*c217d954SCole Faust        ((a), (b##1), (c.s1));     \
5253*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5254*c217d954SCole Faust        ((a), (b##2), (c.s2));     \
5255*c217d954SCole Faust    })
5256*c217d954SCole Faust#elif N0 == 4
5257*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \
5258*c217d954SCole Faust    ({                             \
5259*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5260*c217d954SCole Faust        ((a), (b##0), (c.s0));     \
5261*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5262*c217d954SCole Faust        ((a), (b##1), (c.s1));     \
5263*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5264*c217d954SCole Faust        ((a), (b##2), (c.s2));     \
5265*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5266*c217d954SCole Faust        ((a), (b##3), (c.s3));     \
5267*c217d954SCole Faust    })
5268*c217d954SCole Faust#elif N0 == 8
5269*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \
5270*c217d954SCole Faust    ({                             \
5271*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5272*c217d954SCole Faust        ((a), (b##0), (c.s0));     \
5273*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5274*c217d954SCole Faust        ((a), (b##1), (c.s1));     \
5275*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5276*c217d954SCole Faust        ((a), (b##2), (c.s2));     \
5277*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5278*c217d954SCole Faust        ((a), (b##3), (c.s3));     \
5279*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5280*c217d954SCole Faust        ((a), (b##4), (c.s4));     \
5281*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5282*c217d954SCole Faust        ((a), (b##5), (c.s5));     \
5283*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5284*c217d954SCole Faust        ((a), (b##6), (c.s6));     \
5285*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5286*c217d954SCole Faust        ((a), (b##7), (c.s7));     \
5287*c217d954SCole Faust    })
5288*c217d954SCole Faust#elif N0 == 16
5289*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \
5290*c217d954SCole Faust    ({                             \
5291*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5292*c217d954SCole Faust        ((a), (b##0), (c.s0));     \
5293*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5294*c217d954SCole Faust        ((a), (b##1), (c.s1));     \
5295*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5296*c217d954SCole Faust        ((a), (b##2), (c.s2));     \
5297*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5298*c217d954SCole Faust        ((a), (b##3), (c.s3));     \
5299*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5300*c217d954SCole Faust        ((a), (b##4), (c.s4));     \
5301*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5302*c217d954SCole Faust        ((a), (b##5), (c.s5));     \
5303*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5304*c217d954SCole Faust        ((a), (b##6), (c.s6));     \
5305*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5306*c217d954SCole Faust        ((a), (b##7), (c.s7));     \
5307*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5308*c217d954SCole Faust        ((a), (b##8), (c.s8));     \
5309*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5310*c217d954SCole Faust        ((a), (b##9), (c.s9));     \
5311*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5312*c217d954SCole Faust        ((a), (b##A), (c.sA));     \
5313*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5314*c217d954SCole Faust        ((a), (b##B), (c.sB));     \
5315*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5316*c217d954SCole Faust        ((a), (b##C), (c.sC));     \
5317*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5318*c217d954SCole Faust        ((a), (b##D), (c.sD));     \
5319*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5320*c217d954SCole Faust        ((a), (b##E), (c.sE));     \
5321*c217d954SCole Faust        CONCAT(ARM_DOT, k0)        \
5322*c217d954SCole Faust        ((a), (b##F), (c.sF));     \
5323*c217d954SCole Faust    })
5324*c217d954SCole Faust#else
5325*c217d954SCole Faust#error "N0 value not supported"
5326*c217d954SCole Faust#endif
5327*c217d954SCole Faust
5328*c217d954SCole Faust#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
5329*c217d954SCole Faust
5330*c217d954SCole Faust__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
5331*c217d954SCole Faust                                          IMAGE_DECLARATION(rhs),
5332*c217d954SCole Faust#if defined(BETA)
5333*c217d954SCole Faust                                          IMAGE_DECLARATION(bias),
5334*c217d954SCole Faust#endif
5335*c217d954SCole Faust                                          IMAGE_DECLARATION(dst),
5336*c217d954SCole Faust                                          uint lhs_stride_z,
5337*c217d954SCole Faust                                          uint rhs_stride_z,
5338*c217d954SCole Faust#if defined(BETA)
5339*c217d954SCole Faust                                          uint bias_stride_z,
5340*c217d954SCole Faust#endif
5341*c217d954SCole Faust                                          uint dst_stride_z
5342*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
5343*c217d954SCole Faust                                          ,
5344*c217d954SCole Faust                                          uint lhs_cross_plane_pad
5345*c217d954SCole Faust#endif
5346*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
5347*c217d954SCole Faust                                          ,
5348*c217d954SCole Faust                                          uint dst_cross_plane_pad
5349*c217d954SCole Faust#endif
5350*c217d954SCole Faust                                          ,
5351*c217d954SCole Faust                                          const int M,
5352*c217d954SCole Faust                                          const int N,
5353*c217d954SCole Faust                                          const int K)
5354*c217d954SCole Faust{
5355*c217d954SCole Faust
5356*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (N0))
5357*c217d954SCole Faust
5358*c217d954SCole Faust
5359*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
5360*c217d954SCole Faust#define RHS_OFFSET_X (K0)
5361*c217d954SCole Faust#define RHS_STEP_X ((K0) * (H0))
5362*c217d954SCole Faust#define RHS_STEP_LOOP (1)
5363*c217d954SCole Faust#else
5364*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
5365*c217d954SCole Faust#define RHS_STEP_X (K0)
5366*c217d954SCole Faust#define RHS_STEP_LOOP (H0)
5367*c217d954SCole Faust#endif
5368*c217d954SCole Faust
5369*c217d954SCole Faust    uint x = get_global_id(0);
5370*c217d954SCole Faust    uint y = get_global_id(1);
5371*c217d954SCole Faust    uint z = get_global_id(2);
5372*c217d954SCole Faust
5373*c217d954SCole Faust    const bool cond_y = y == 0;
5374*c217d954SCole Faust    const bool cond_x = ((x + 1) * N0 >= N);
5375*c217d954SCole Faust
5376*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
5377*c217d954SCole Faust    if((x * N0 >= N) || (y * M0 >= M))
5378*c217d954SCole Faust    {
5379*c217d954SCole Faust        return;
5380*c217d954SCole Faust    }
5381*c217d954SCole Faust#endif
5382*c217d954SCole Faust
5383*c217d954SCole Faust
5384*c217d954SCole Faust    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
5385*c217d954SCole Faust
5386*c217d954SCole Faust
5387*c217d954SCole Faust    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
5388*c217d954SCole Faust
5389*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
5390*c217d954SCole Faust
5391*c217d954SCole Faust    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
5392*c217d954SCole Faust#else
5393*c217d954SCole Faust    rhs_offset += z * rhs_stride_z;
5394*c217d954SCole Faust#endif
5395*c217d954SCole Faust
5396*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
5397*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
5398*c217d954SCole Faust
5399*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
5400*c217d954SCole Faust
5401*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
5402*c217d954SCole Faust
5403*c217d954SCole Faust
5404*c217d954SCole Faust
5405*c217d954SCole Faust    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
5406*c217d954SCole Faust
5407*c217d954SCole Faust#else
5408*c217d954SCole Faust
5409*c217d954SCole Faust
5410*c217d954SCole Faust    lhs_offset += z * lhs_stride_z;
5411*c217d954SCole Faust
5412*c217d954SCole Faust#endif
5413*c217d954SCole Faust
5414*c217d954SCole Faust
5415*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
5416*c217d954SCole Faust
5417*c217d954SCole Faust    int i = 0;
5418*c217d954SCole Faust    for(; i <= (K - K0); i += K0)
5419*c217d954SCole Faust    {
5420*c217d954SCole Faust
5421*c217d954SCole Faust
5422*c217d954SCole Faust
5423*c217d954SCole Faust
5424*c217d954SCole Faust
5425*c217d954SCole Faust
5426*c217d954SCole Faust
5427*c217d954SCole Faust
5428*c217d954SCole Faust
5429*c217d954SCole Faust
5430*c217d954SCole Faust        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
5431*c217d954SCole Faust
5432*c217d954SCole Faust
5433*c217d954SCole Faust        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
5434*c217d954SCole Faust
5435*c217d954SCole Faust
5436*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a0, b, c0);
5437*c217d954SCole Faust#if M0 > 1
5438*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a1, b, c1);
5439*c217d954SCole Faust#endif
5440*c217d954SCole Faust#if M0 > 2
5441*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a2, b, c2);
5442*c217d954SCole Faust#endif
5443*c217d954SCole Faust#if M0 > 3
5444*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a3, b, c3);
5445*c217d954SCole Faust#endif
5446*c217d954SCole Faust#if M0 > 4
5447*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a4, b, c4);
5448*c217d954SCole Faust#endif
5449*c217d954SCole Faust#if M0 > 5
5450*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a5, b, c5);
5451*c217d954SCole Faust#endif
5452*c217d954SCole Faust#if M0 > 6
5453*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a6, b, c6);
5454*c217d954SCole Faust#endif
5455*c217d954SCole Faust#if M0 > 7
5456*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a7, b, c7);
5457*c217d954SCole Faust#endif
5458*c217d954SCole Faust
5459*c217d954SCole Faust        lhs_offset += K0 * sizeof(DATA_TYPE);
5460*c217d954SCole Faust        rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
5461*c217d954SCole Faust    }
5462*c217d954SCole Faust
5463*c217d954SCole Faust
5464*c217d954SCole Faust    for(; i < K; ++i)
5465*c217d954SCole Faust    {
5466*c217d954SCole Faust
5467*c217d954SCole Faust        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
5468*c217d954SCole Faust
5469*c217d954SCole Faust
5470*c217d954SCole Faust        LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
5471*c217d954SCole Faust
5472*c217d954SCole Faust
5473*c217d954SCole Faust        ARM_DOT_K0XN0(1, a0, b, c0);
5474*c217d954SCole Faust#if M0 > 1
5475*c217d954SCole Faust        ARM_DOT_K0XN0(1, a1, b, c1);
5476*c217d954SCole Faust#endif
5477*c217d954SCole Faust#if M0 > 2
5478*c217d954SCole Faust        ARM_DOT_K0XN0(1, a2, b, c2);
5479*c217d954SCole Faust#endif
5480*c217d954SCole Faust#if M0 > 3
5481*c217d954SCole Faust        ARM_DOT_K0XN0(1, a3, b, c3);
5482*c217d954SCole Faust#endif
5483*c217d954SCole Faust#if M0 > 4
5484*c217d954SCole Faust        ARM_DOT_K0XN0(1, a4, b, c4);
5485*c217d954SCole Faust#endif
5486*c217d954SCole Faust#if M0 > 5
5487*c217d954SCole Faust        ARM_DOT_K0XN0(1, a5, b, c5);
5488*c217d954SCole Faust#endif
5489*c217d954SCole Faust#if M0 > 6
5490*c217d954SCole Faust        ARM_DOT_K0XN0(1, a6, b, c6);
5491*c217d954SCole Faust#endif
5492*c217d954SCole Faust#if M0 > 7
5493*c217d954SCole Faust        ARM_DOT_K0XN0(1, a7, b, c7);
5494*c217d954SCole Faust#endif
5495*c217d954SCole Faust
5496*c217d954SCole Faust        lhs_offset += sizeof(DATA_TYPE);
5497*c217d954SCole Faust        rhs_offset += sizeof(DATA_TYPE);
5498*c217d954SCole Faust    }
5499*c217d954SCole Faust
5500*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
5501*c217d954SCole Faust
5502*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);
5503*c217d954SCole Faust
5504*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
5505*c217d954SCole Faust
5506*c217d954SCole Faust
5507*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
5508*c217d954SCole Faust
5509*c217d954SCole Faust
5510*c217d954SCole Faust
5511*c217d954SCole Faust    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
5512*c217d954SCole Faust
5513*c217d954SCole Faust#else
5514*c217d954SCole Faust
5515*c217d954SCole Faust
5516*c217d954SCole Faust    dst_addr += z * dst_stride_z;
5517*c217d954SCole Faust
5518*c217d954SCole Faust#endif
5519*c217d954SCole Faust
5520*c217d954SCole Faust
5521*c217d954SCole Faust#if defined(ALPHA)
5522*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
5523*c217d954SCole Faust#endif
5524*c217d954SCole Faust
5525*c217d954SCole Faust
5526*c217d954SCole Faust#if defined(BETA)
5527*c217d954SCole Faust#if defined(BROADCAST_BIAS)
5528*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
5529*c217d954SCole Faust
5530*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
5531*c217d954SCole Faust
5532*c217d954SCole Faust#ifndef UNIT_BETA
5533*c217d954SCole Faust    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
5534*c217d954SCole Faust#endif
5535*c217d954SCole Faust
5536*c217d954SCole Faust
5537*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias0);
5538*c217d954SCole Faust
5539*c217d954SCole Faust#else
5540*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
5541*c217d954SCole Faust
5542*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
5543*c217d954SCole Faust
5544*c217d954SCole Faust#ifndef UNIT_BETA
5545*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
5546*c217d954SCole Faust#endif
5547*c217d954SCole Faust
5548*c217d954SCole Faust
5549*c217d954SCole Faust    ADD_BLOCK(M0, c, bias);
5550*c217d954SCole Faust
5551*c217d954SCole Faust#endif
5552*c217d954SCole Faust#endif
5553*c217d954SCole Faust
5554*c217d954SCole Faust#if defined(ACTIVATION_TYPE)
5555*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
5556*c217d954SCole Faust#endif
5557*c217d954SCole Faust
5558*c217d954SCole Faust
5559*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
5560*c217d954SCole Faust
5561*c217d954SCole Faust#undef RHS_BLOCK_SIZE
5562*c217d954SCole Faust#undef RHS_OFFSET_X
5563*c217d954SCole Faust#undef RHS_STEP_X
5564*c217d954SCole Faust#undef RHS_STEP_LOOP
5565*c217d954SCole Faust}
5566*c217d954SCole Faust#endif
5567*c217d954SCole Faust
5568*c217d954SCole Faust#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
5569*c217d954SCole Faust
// Matrix-multiply kernel: each work-item accumulates one M0 x N0 block of dst.
//
// The LHS block is read from a buffer with LOAD_BLOCK, the RHS block is read
// from a read-only 2D image with LOAD_TEXTURE2D, and the products are
// accumulated with ARM_DOT_K0XN0 in steps of K0 along K. A final zero-padded
// step handles the K % K0 leftover elements. The epilogue stages are selected
// at compile time: scaling by ALPHA, bias addition (BETA / BROADCAST_BIAS /
// UNIT_BETA) and an activation (ACTIVATION_TYPE, A_VAL, B_VAL).
//
// Build-time macros used here but defined outside this kernel: M0, N0, K0, H0,
// DATA_TYPE, RHS_HEIGHT, PARTIAL_STORE_M0, PARTIAL_STORE_N0 and, for the 3D
// reinterpretation paths, HEIGHT_GEMM3D and DEPTH_GEMM3D.
//
// Parameters:
//   lhs                  LHS matrix, expanded by IMAGE_DECLARATION; the body uses
//                        lhs_ptr, lhs_offset_first_element_in_bytes and lhs_stride_y.
//   rhs_img              RHS matrix as a read-only image2d_t.
//   bias                 Bias tensor; only present when BETA is defined.
//   dst                  Destination matrix, expanded by IMAGE_DECLARATION.
//   lhs_stride_z         Stride applied per z index to the LHS byte offset.
//   rhs_stride_z         Not referenced in this kernel body; the RHS plane is
//                        selected through RHS_HEIGHT instead.
//   bias_stride_z        Stride applied per z index to the bias address (BETA only).
//   dst_stride_z         Stride applied per z index to the dst address.
//   lhs_cross_plane_pad  Forwarded to CALCULATE_Z_OFFSET (REINTERPRET_INPUT_AS_3D only).
//   dst_cross_plane_pad  Forwarded to CALCULATE_Z_OFFSET (REINTERPRET_OUTPUT_AS_3D only).
//   M, N                 Row / column bounds used by the DUMMY_WORK_ITEMS guard;
//                        N also drives cond_x.
//   K                    Length of the accumulation dimension.
5570*c217d954SCole Faust__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
5571*c217d954SCole Faust                                                  __read_only image2d_t rhs_img,
5572*c217d954SCole Faust#if defined(BETA)
5573*c217d954SCole Faust                                                  IMAGE_DECLARATION(bias),
5574*c217d954SCole Faust#endif
5575*c217d954SCole Faust                                                  IMAGE_DECLARATION(dst),
5576*c217d954SCole Faust                                                  uint lhs_stride_z,
5577*c217d954SCole Faust                                                  uint rhs_stride_z,
5578*c217d954SCole Faust#if defined(BETA)
5579*c217d954SCole Faust                                                  uint bias_stride_z,
5580*c217d954SCole Faust#endif
5581*c217d954SCole Faust                                                  uint dst_stride_z
5582*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
5583*c217d954SCole Faust                                                  ,
5584*c217d954SCole Faust                                                  uint lhs_cross_plane_pad
5585*c217d954SCole Faust#endif
5586*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
5587*c217d954SCole Faust                                                  ,
5588*c217d954SCole Faust                                                  uint dst_cross_plane_pad
5589*c217d954SCole Faust#endif
5590*c217d954SCole Faust                                                  ,
5591*c217d954SCole Faust                                                  const int M,
5592*c217d954SCole Faust                                                  const int N,
5593*c217d954SCole Faust                                                  const int K)
5594*c217d954SCole Faust{
5595*c217d954SCole Faust
    // K0 converted by CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT (defined elsewhere);
    // presumably the number of image pixels that hold K0 elements - TODO confirm.
5596*c217d954SCole Faust#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
5597*c217d954SCole Faust
    // Number of K elements left over once the main loop has consumed full K0 steps.
5598*c217d954SCole Faust    const uint LEFTOVER_K = K % K0;
5599*c217d954SCole Faust
5600*c217d954SCole Faust
    // Size of one RHS block, in the same units as PIXEL_UNIT.
5601*c217d954SCole Faust#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
5602*c217d954SCole Faust
5603*c217d954SCole Faust
    // RHS addressing steps. The values depend on whether RHS_INTERLEAVE is
    // defined; both variants are undefined again at the end of the kernel.
5604*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
5605*c217d954SCole Faust#define RHS_OFFSET_X (PIXEL_UNIT)
5606*c217d954SCole Faust#define RHS_STEP_X (PIXEL_UNIT * (H0))
5607*c217d954SCole Faust#define RHS_STEP_LOOP (1)
5608*c217d954SCole Faust#else
5609*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
5610*c217d954SCole Faust#define RHS_STEP_X PIXEL_UNIT
5611*c217d954SCole Faust#define RHS_STEP_LOOP (H0)
5612*c217d954SCole Faust#endif
5613*c217d954SCole Faust
    // Work-item coordinates: x selects the N0-wide column block, y the M0-high
    // row block, z the plane (batch) index.
5614*c217d954SCole Faust    uint x = get_global_id(0);
5615*c217d954SCole Faust    uint y = get_global_id(1);
5616*c217d954SCole Faust    uint z = get_global_id(2);
5617*c217d954SCole Faust
    // Flags handed to the *_BOUNDARY_AWARE macros together with
    // PARTIAL_STORE_M0 / PARTIAL_STORE_N0. cond_y is true only for the first
    // row block; cond_x is true when this column block reaches or passes N.
5618*c217d954SCole Faust    const bool cond_y = y == 0;
5619*c217d954SCole Faust    const bool cond_x = ((x + 1) * N0 >= N);
5620*c217d954SCole Faust
    // Optional early exit for work-items whose block starts at or beyond N or M.
5621*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
5622*c217d954SCole Faust    if((x * N0 >= N) || (y * M0 >= M))
5623*c217d954SCole Faust    {
5624*c217d954SCole Faust        return;
5625*c217d954SCole Faust    }
5626*c217d954SCole Faust#endif
5627*c217d954SCole Faust
5628*c217d954SCole Faust
    // Byte offset of the first LHS row handled by this work-item. The start row
    // comes from COMPUTE_M0_START_ROW (defined elsewhere).
5629*c217d954SCole Faust    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
5630*c217d954SCole Faust
    // RHS plane index: wraps modulo MATRIX_B_DEPTH when that macro is defined,
    // otherwise it is the z work-item index itself.
5631*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
5632*c217d954SCole Faust
5633*c217d954SCole Faust    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
5634*c217d954SCole Faust#else
5635*c217d954SCole Faust    const uint z_rhs = get_global_id(2);
5636*c217d954SCole Faust#endif
5637*c217d954SCole Faust
5638*c217d954SCole Faust
    // Image coordinates of the RHS block: x_rhs advances inside the loop,
    // y_rhs is fixed for the whole kernel invocation.
5639*c217d954SCole Faust    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
5640*c217d954SCole Faust    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
5641*c217d954SCole Faust
    // Per-row extra LHS offsets (zlhs0..) and a bank of zero offsets (zero0..),
    // all initialised to 0.
5642*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
5643*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
5644*c217d954SCole Faust
5645*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
5646*c217d954SCole Faust
    // 3D input: CALCULATE_Z_OFFSET fills zlhs per row (semantics defined
    // elsewhere) and the plane offset is scaled by DEPTH_GEMM3D.
5647*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
5648*c217d954SCole Faust
5649*c217d954SCole Faust
5650*c217d954SCole Faust
5651*c217d954SCole Faust    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
5652*c217d954SCole Faust
5653*c217d954SCole Faust#else
5654*c217d954SCole Faust
5655*c217d954SCole Faust
    // Plain input: move to plane z of the LHS.
5656*c217d954SCole Faust    lhs_offset += z * lhs_stride_z;
5657*c217d954SCole Faust
5658*c217d954SCole Faust#endif
5659*c217d954SCole Faust
5660*c217d954SCole Faust
    // Accumulators c0..c(M0-1), one N0-wide vector per output row, zeroed.
5661*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
5662*c217d954SCole Faust
    // Main loop: consume K in full steps of K0.
5663*c217d954SCole Faust    int i = 0;
5664*c217d954SCole Faust    for(; i <= (K - K0); i += K0)
5665*c217d954SCole Faust    {
5666*c217d954SCole Faust
        // Load M0 LHS rows of K0 elements each into a0..a(M0-1).
5667*c217d954SCole Faust        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
5668*c217d954SCole Faust
5669*c217d954SCole Faust
        // Declare b0..b(N0-1) as zeroed K0-wide vectors, then fill them from the image.
5670*c217d954SCole Faust        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
5671*c217d954SCole Faust        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
5672*c217d954SCole Faust
5673*c217d954SCole Faust
        // One accumulation per output row; rows beyond M0 are compiled out.
5674*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a0, b, c0);
5675*c217d954SCole Faust#if M0 > 1
5676*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a1, b, c1);
5677*c217d954SCole Faust#endif
5678*c217d954SCole Faust#if M0 > 2
5679*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a2, b, c2);
5680*c217d954SCole Faust#endif
5681*c217d954SCole Faust#if M0 > 3
5682*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a3, b, c3);
5683*c217d954SCole Faust#endif
5684*c217d954SCole Faust#if M0 > 4
5685*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a4, b, c4);
5686*c217d954SCole Faust#endif
5687*c217d954SCole Faust#if M0 > 5
5688*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a5, b, c5);
5689*c217d954SCole Faust#endif
5690*c217d954SCole Faust#if M0 > 6
5691*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a6, b, c6);
5692*c217d954SCole Faust#endif
5693*c217d954SCole Faust#if M0 > 7
5694*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a7, b, c7);
5695*c217d954SCole Faust#endif
5696*c217d954SCole Faust
        // Advance the LHS by K0 elements (in bytes) and the RHS image x
        // coordinate by one loop step.
5697*c217d954SCole Faust        lhs_offset += K0 * sizeof(DATA_TYPE);
5698*c217d954SCole Faust        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
5699*c217d954SCole Faust    }
5700*c217d954SCole Faust
    // Leftover step: fewer than K0 elements of K remain.
5701*c217d954SCole Faust    if(LEFTOVER_K != 0)
5702*c217d954SCole Faust    {
5703*c217d954SCole Faust
5704*c217d954SCole Faust
5705*c217d954SCole Faust
5706*c217d954SCole Faust
        // Union giving scalar access (s[k]) to the lanes of a K0-wide vector (v).
5707*c217d954SCole Faust        union UNION_VEC_TYPE
5708*c217d954SCole Faust        {
5709*c217d954SCole Faust            DATA_TYPE s[K0];
5710*c217d954SCole Faust            VEC_DATA_TYPE(DATA_TYPE, K0)
5711*c217d954SCole Faust            v;
5712*c217d954SCole Faust        };
5713*c217d954SCole Faust
        // LHS rows start as all-zero vectors; only the first LEFTOVER_K lanes
        // are overwritten below, so the remaining lanes stay 0.
5714*c217d954SCole Faust        union UNION_VEC_TYPE a0 = {.v = 0 };
5715*c217d954SCole Faust#if M0 > 1
5716*c217d954SCole Faust        union UNION_VEC_TYPE a1 = {.v = 0 };
5717*c217d954SCole Faust#endif
5718*c217d954SCole Faust#if M0 > 2
5719*c217d954SCole Faust        union UNION_VEC_TYPE a2 = {.v = 0 };
5720*c217d954SCole Faust#endif
5721*c217d954SCole Faust#if M0 > 3
5722*c217d954SCole Faust        union UNION_VEC_TYPE a3 = {.v = 0 };
5723*c217d954SCole Faust#endif
5724*c217d954SCole Faust#if M0 > 4
5725*c217d954SCole Faust        union UNION_VEC_TYPE a4 = {.v = 0 };
5726*c217d954SCole Faust#endif
5727*c217d954SCole Faust#if M0 > 5
5728*c217d954SCole Faust        union UNION_VEC_TYPE a5 = {.v = 0 };
5729*c217d954SCole Faust#endif
5730*c217d954SCole Faust#if M0 > 6
5731*c217d954SCole Faust        union UNION_VEC_TYPE a6 = {.v = 0 };
5732*c217d954SCole Faust#endif
5733*c217d954SCole Faust#if M0 > 7
5734*c217d954SCole Faust        union UNION_VEC_TYPE a7 = {.v = 0 };
5735*c217d954SCole Faust#endif
5736*c217d954SCole Faust
5737*c217d954SCole Faust        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
5738*c217d954SCole Faust
5739*c217d954SCole Faust
        // The RHS is still loaded at full K0 width, with the same arguments as
        // in the main loop.
5740*c217d954SCole Faust        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
5741*c217d954SCole Faust
5742*c217d954SCole Faust
        // Copy the remaining LHS elements one scalar at a time, for every row.
5743*c217d954SCole Faust        for(int k = 0; k < LEFTOVER_K; ++k)
5744*c217d954SCole Faust        {
5745*c217d954SCole Faust            a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
5746*c217d954SCole Faust#if M0 > 1
5747*c217d954SCole Faust            a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
5748*c217d954SCole Faust#endif
5749*c217d954SCole Faust#if M0 > 2
5750*c217d954SCole Faust            a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
5751*c217d954SCole Faust#endif
5752*c217d954SCole Faust#if M0 > 3
5753*c217d954SCole Faust            a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
5754*c217d954SCole Faust#endif
5755*c217d954SCole Faust#if M0 > 4
5756*c217d954SCole Faust            a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
5757*c217d954SCole Faust#endif
5758*c217d954SCole Faust#if M0 > 5
5759*c217d954SCole Faust            a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
5760*c217d954SCole Faust#endif
5761*c217d954SCole Faust#if M0 > 6
5762*c217d954SCole Faust            a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
5763*c217d954SCole Faust#endif
5764*c217d954SCole Faust#if M0 > 7
5765*c217d954SCole Faust            a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
5766*c217d954SCole Faust#endif
5767*c217d954SCole Faust
            // Step to the next LHS element along K.
5768*c217d954SCole Faust            lhs_offset += sizeof(DATA_TYPE);
5769*c217d954SCole Faust        }
5770*c217d954SCole Faust
5771*c217d954SCole Faust
        // Same full-width accumulation as the main loop, applied to the
        // zero-padded LHS vectors.
5772*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a0.v, b, c0);
5773*c217d954SCole Faust#if M0 > 1
5774*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a1.v, b, c1);
5775*c217d954SCole Faust#endif
5776*c217d954SCole Faust#if M0 > 2
5777*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a2.v, b, c2);
5778*c217d954SCole Faust#endif
5779*c217d954SCole Faust#if M0 > 3
5780*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a3.v, b, c3);
5781*c217d954SCole Faust#endif
5782*c217d954SCole Faust#if M0 > 4
5783*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a4.v, b, c4);
5784*c217d954SCole Faust#endif
5785*c217d954SCole Faust#if M0 > 5
5786*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a5.v, b, c5);
5787*c217d954SCole Faust#endif
5788*c217d954SCole Faust#if M0 > 6
5789*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a6.v, b, c6);
5790*c217d954SCole Faust#endif
5791*c217d954SCole Faust#if M0 > 7
5792*c217d954SCole Faust        ARM_DOT_K0XN0(K0, a7.v, b, c7);
5793*c217d954SCole Faust#endif
5794*c217d954SCole Faust    }
5795*c217d954SCole Faust
    // Byte address of the first dst element written by this work-item:
    // column block x, start row from COMPUTE_M0_START_ROW.
5796*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
5797*c217d954SCole Faust
    // Per-row extra dst offsets (zout0..), initialised to 0.
5798*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
5799*c217d954SCole Faust
5800*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
5801*c217d954SCole Faust
5802*c217d954SCole Faust
    // 3D output: CALCULATE_Z_OFFSET fills zout per row (semantics defined
    // elsewhere) and the plane offset is scaled by DEPTH_GEMM3D.
5803*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
5804*c217d954SCole Faust
5805*c217d954SCole Faust
5806*c217d954SCole Faust
5807*c217d954SCole Faust    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
5808*c217d954SCole Faust
5809*c217d954SCole Faust#else
5810*c217d954SCole Faust
5811*c217d954SCole Faust
    // Plain output: move to plane z of dst.
5812*c217d954SCole Faust    dst_addr += z * dst_stride_z;
5813*c217d954SCole Faust
5814*c217d954SCole Faust#endif
5815*c217d954SCole Faust
5816*c217d954SCole Faust
    // Optional scaling of the accumulated block by ALPHA.
5817*c217d954SCole Faust#if defined(ALPHA)
5818*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
5819*c217d954SCole Faust#endif
5820*c217d954SCole Faust
5821*c217d954SCole Faust
    // Optional bias addition. The bias is scaled by BETA unless UNIT_BETA is defined.
5822*c217d954SCole Faust#if defined(BETA)
5823*c217d954SCole Faust#if defined(BROADCAST_BIAS)
    // Broadcast variant: load a single N0-wide bias row (bias0) for this column
    // block and add it to every row of c.
5824*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
5825*c217d954SCole Faust
5826*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
5827*c217d954SCole Faust
5828*c217d954SCole Faust#ifndef UNIT_BETA
5829*c217d954SCole Faust    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
5830*c217d954SCole Faust#endif
5831*c217d954SCole Faust
5832*c217d954SCole Faust
5833*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias0);
5834*c217d954SCole Faust
5835*c217d954SCole Faust#else
    // Full variant: load an M0 x N0 bias block addressed like dst (same column
    // block and start row, plane z) and add it to c row by row.
5836*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
5837*c217d954SCole Faust
5838*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
5839*c217d954SCole Faust
5840*c217d954SCole Faust#ifndef UNIT_BETA
5841*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
5842*c217d954SCole Faust#endif
5843*c217d954SCole Faust
5844*c217d954SCole Faust
5845*c217d954SCole Faust    ADD_BLOCK(M0, c, bias);
5846*c217d954SCole Faust
5847*c217d954SCole Faust#endif
5848*c217d954SCole Faust#endif
5849*c217d954SCole Faust
    // Optional activation applied to the whole M0 x N0 block.
5850*c217d954SCole Faust#if defined(ACTIVATION_TYPE)
5851*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
5852*c217d954SCole Faust#endif
5853*c217d954SCole Faust
5854*c217d954SCole Faust
    // Write the result block; cond_y / cond_x and the PARTIAL_STORE_* sizes are
    // forwarded to the boundary-aware store macro (presumably to select partial
    // stores at the matrix edges - TODO confirm against its definition).
5855*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
5856*c217d954SCole Faust
    // Drop the kernel-local helper macros so later code can redefine them.
5857*c217d954SCole Faust#undef RHS_BLOCK_SIZE
5858*c217d954SCole Faust#undef RHS_OFFSET_X
5859*c217d954SCole Faust#undef RHS_STEP_X
5860*c217d954SCole Faust#undef RHS_STEP_LOOP
5861*c217d954SCole Faust#undef PIXEL_UNIT
5862*c217d954SCole Faust}
5863*c217d954SCole Faust#endif
5864*c217d954SCole Faust
// VFMA(a, b, c): accumulate in place, c = fma(a, b, c).
// c is assigned to, so it must be a modifiable lvalue. The expansion is wrapped
// in a ({ ... }) block so that it can be used like a single expression.
5865*c217d954SCole Faust#define VFMA(a, b, c)     \
5866*c217d954SCole Faust    ({                    \
5867*c217d954SCole Faust        c = fma(a, b, c); \
5868*c217d954SCole Faust    })
5869*c217d954SCole Faust
// VFMA_M0xN0(i, a, b, c): one VFMA per output row, for rows 0..M0-1.
//   i  lane selector pasted into the component name (.s##i)
//   a  base name of the per-row vectors a0, a1, ... (pasted as a##r)
//   b  vector operand shared by all rows
//   c  base name of the per-row accumulators c0, c1, ... (pasted as c##r)
// For each row r, lane i of a##r is converted to an N0-wide vector of
// DATA_TYPE and VFMA accumulates that vector times b into c##r.
// One variant exists per supported M0 (1 to 8); any other value hits the
// #error branch. An undefined M0 evaluates as 0 inside #if and ends up there too.
5870*c217d954SCole Faust#if M0 == 1
5871*c217d954SCole Faust#define VFMA_M0xN0(i, a, b, c)                                        \
5872*c217d954SCole Faust    ({                                                                \
5873*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5874*c217d954SCole Faust    })
5875*c217d954SCole Faust#elif M0 == 2
5876*c217d954SCole Faust#define VFMA_M0xN0(i, a, b, c)                                        \
5877*c217d954SCole Faust    ({                                                                \
5878*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5879*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5880*c217d954SCole Faust    })
5881*c217d954SCole Faust#elif M0 == 3
5882*c217d954SCole Faust#define VFMA_M0xN0(i, a, b, c)                                        \
5883*c217d954SCole Faust    ({                                                                \
5884*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5885*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5886*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5887*c217d954SCole Faust    })
5888*c217d954SCole Faust#elif M0 == 4
5889*c217d954SCole Faust#define VFMA_M0xN0(i, a, b, c)                                        \
5890*c217d954SCole Faust    ({                                                                \
5891*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5892*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5893*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5894*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5895*c217d954SCole Faust    })
5896*c217d954SCole Faust#elif M0 == 5
5897*c217d954SCole Faust#define VFMA_M0xN0(i, a, b, c)                                        \
5898*c217d954SCole Faust    ({                                                                \
5899*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5900*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5901*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5902*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5903*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
5904*c217d954SCole Faust    })
5905*c217d954SCole Faust#elif M0 == 6
5906*c217d954SCole Faust#define VFMA_M0xN0(i, a, b, c)                                        \
5907*c217d954SCole Faust    ({                                                                \
5908*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5909*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5910*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5911*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5912*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
5913*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
5914*c217d954SCole Faust    })
5915*c217d954SCole Faust#elif M0 == 7
5916*c217d954SCole Faust#define VFMA_M0xN0(i, a, b, c)                                        \
5917*c217d954SCole Faust    ({                                                                \
5918*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5919*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5920*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5921*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5922*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
5923*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
5924*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
5925*c217d954SCole Faust    })
5926*c217d954SCole Faust#elif M0 == 8
5927*c217d954SCole Faust#define VFMA_M0xN0(i, a, b, c)                                        \
5928*c217d954SCole Faust    ({                                                                \
5929*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5930*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5931*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5932*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5933*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
5934*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
5935*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
5936*c217d954SCole Faust        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
5937*c217d954SCole Faust    })
5938*c217d954SCole Faust#else
5939*c217d954SCole Faust#error "M0 not supported"
5940*c217d954SCole Faust#endif
5941*c217d954SCole Faust
5942*c217d954SCole Faust#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
5943*c217d954SCole Faust
5944*c217d954SCole Faust__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
5945*c217d954SCole Faust                                           IMAGE_DECLARATION(rhs),
5946*c217d954SCole Faust#if defined(BETA)
5947*c217d954SCole Faust                                           IMAGE_DECLARATION(bias),
5948*c217d954SCole Faust#endif
5949*c217d954SCole Faust                                           IMAGE_DECLARATION(dst),
5950*c217d954SCole Faust                                           uint lhs_stride_z,
5951*c217d954SCole Faust                                           uint rhs_stride_z,
5952*c217d954SCole Faust#if defined(BETA)
5953*c217d954SCole Faust                                           uint bias_stride_z,
5954*c217d954SCole Faust#endif
5955*c217d954SCole Faust                                           uint dst_stride_z
5956*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
5957*c217d954SCole Faust                                           ,
5958*c217d954SCole Faust                                           uint lhs_cross_plane_pad
5959*c217d954SCole Faust#endif
5960*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
5961*c217d954SCole Faust                                           ,
5962*c217d954SCole Faust                                           uint dst_cross_plane_pad
5963*c217d954SCole Faust#endif
5964*c217d954SCole Faust                                           ,
5965*c217d954SCole Faust                                           const int M,
5966*c217d954SCole Faust                                           const int N,
5967*c217d954SCole Faust                                           const int K)
5968*c217d954SCole Faust{
5969*c217d954SCole Faust
5970*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (N0))
5971*c217d954SCole Faust
5972*c217d954SCole Faust
5973*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
5974*c217d954SCole Faust#define RHS_OFFSET_X (N0)
5975*c217d954SCole Faust#define RHS_STEP_X ((N0) * (H0))
5976*c217d954SCole Faust#define RHS_STEP_LOOP (1)
5977*c217d954SCole Faust#else
5978*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
5979*c217d954SCole Faust#define RHS_STEP_X (N0)
5980*c217d954SCole Faust#define RHS_STEP_LOOP (H0)
5981*c217d954SCole Faust#endif
5982*c217d954SCole Faust
5983*c217d954SCole Faust    uint x = get_global_id(0);
5984*c217d954SCole Faust    uint y = get_global_id(1);
5985*c217d954SCole Faust    uint z = get_global_id(2);
5986*c217d954SCole Faust
5987*c217d954SCole Faust    const bool cond_y = y == 0;
5988*c217d954SCole Faust    const bool cond_x = ((x + 1) * N0 >= N);
5989*c217d954SCole Faust
5990*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
5991*c217d954SCole Faust    if((x * N0 >= N) || (y * M0 >= M))
5992*c217d954SCole Faust    {
5993*c217d954SCole Faust        return;
5994*c217d954SCole Faust    }
5995*c217d954SCole Faust#endif
5996*c217d954SCole Faust
5997*c217d954SCole Faust
5998*c217d954SCole Faust    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
5999*c217d954SCole Faust
6000*c217d954SCole Faust
6001*c217d954SCole Faust    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
6002*c217d954SCole Faust
6003*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
6004*c217d954SCole Faust
6005*c217d954SCole Faust    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
6006*c217d954SCole Faust#else
6007*c217d954SCole Faust    rhs_offset += z * rhs_stride_z;
6008*c217d954SCole Faust#endif
6009*c217d954SCole Faust
6010*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
6011*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
6012*c217d954SCole Faust
6013*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
6014*c217d954SCole Faust
6015*c217d954SCole Faust
6016*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
6017*c217d954SCole Faust
6018*c217d954SCole Faust
6019*c217d954SCole Faust
6020*c217d954SCole Faust    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
6021*c217d954SCole Faust
6022*c217d954SCole Faust#else
6023*c217d954SCole Faust
6024*c217d954SCole Faust
6025*c217d954SCole Faust    lhs_offset += z * lhs_stride_z;
6026*c217d954SCole Faust
6027*c217d954SCole Faust#endif
6028*c217d954SCole Faust
6029*c217d954SCole Faust
6030*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
6031*c217d954SCole Faust
6032*c217d954SCole Faust    int i = 0;
6033*c217d954SCole Faust    for(; i <= (K - K0); i += K0)
6034*c217d954SCole Faust    {
6035*c217d954SCole Faust
6036*c217d954SCole Faust
6037*c217d954SCole Faust
6038*c217d954SCole Faust
6039*c217d954SCole Faust
6040*c217d954SCole Faust
6041*c217d954SCole Faust
6042*c217d954SCole Faust
6043*c217d954SCole Faust
6044*c217d954SCole Faust
6045*c217d954SCole Faust        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
6046*c217d954SCole Faust
6047*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, N0)
6048*c217d954SCole Faust        b0;
6049*c217d954SCole Faust
6050*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
6051*c217d954SCole Faust        VFMA_M0xN0(0, a, b0, c);
6052*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
6053*c217d954SCole Faust        VFMA_M0xN0(1, a, b0, c);
6054*c217d954SCole Faust#if K0 > 2
6055*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
6056*c217d954SCole Faust        VFMA_M0xN0(2, a, b0, c);
6057*c217d954SCole Faust#endif
6058*c217d954SCole Faust#if K0 > 3
6059*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
6060*c217d954SCole Faust        VFMA_M0xN0(3, a, b0, c);
6061*c217d954SCole Faust#endif
6062*c217d954SCole Faust#if K0 > 4
6063*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
6064*c217d954SCole Faust        VFMA_M0xN0(4, a, b0, c);
6065*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
6066*c217d954SCole Faust        VFMA_M0xN0(5, a, b0, c);
6067*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
6068*c217d954SCole Faust        VFMA_M0xN0(6, a, b0, c);
6069*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
6070*c217d954SCole Faust        VFMA_M0xN0(7, a, b0, c);
6071*c217d954SCole Faust#endif
6072*c217d954SCole Faust#if K0 > 8
6073*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
6074*c217d954SCole Faust        VFMA_M0xN0(8, a, b0, c);
6075*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
6076*c217d954SCole Faust        VFMA_M0xN0(9, a, b0, c);
6077*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
6078*c217d954SCole Faust        VFMA_M0xN0(A, a, b0, c);
6079*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
6080*c217d954SCole Faust        VFMA_M0xN0(B, a, b0, c);
6081*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
6082*c217d954SCole Faust        VFMA_M0xN0(C, a, b0, c);
6083*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
6084*c217d954SCole Faust        VFMA_M0xN0(D, a, b0, c);
6085*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
6086*c217d954SCole Faust        VFMA_M0xN0(E, a, b0, c);
6087*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
6088*c217d954SCole Faust        VFMA_M0xN0(F, a, b0, c);
6089*c217d954SCole Faust#endif
6090*c217d954SCole Faust
6091*c217d954SCole Faust        lhs_offset += K0 * sizeof(DATA_TYPE);
6092*c217d954SCole Faust        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
6093*c217d954SCole Faust    }
6094*c217d954SCole Faust
6095*c217d954SCole Faust
6096*c217d954SCole Faust    for(; i < K; ++i)
6097*c217d954SCole Faust    {
6098*c217d954SCole Faust
6099*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 2)
6100*c217d954SCole Faust        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
6101*c217d954SCole Faust#if M0 > 1
6102*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 2)
6103*c217d954SCole Faust        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
6104*c217d954SCole Faust#endif
6105*c217d954SCole Faust#if M0 > 2
6106*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 2)
6107*c217d954SCole Faust        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
6108*c217d954SCole Faust#endif
6109*c217d954SCole Faust#if M0 > 3
6110*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 2)
6111*c217d954SCole Faust        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
6112*c217d954SCole Faust#endif
6113*c217d954SCole Faust#if M0 > 4
6114*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 2)
6115*c217d954SCole Faust        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
6116*c217d954SCole Faust#endif
6117*c217d954SCole Faust#if M0 > 5
6118*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 2)
6119*c217d954SCole Faust        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
6120*c217d954SCole Faust#endif
6121*c217d954SCole Faust#if M0 > 6
6122*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 2)
6123*c217d954SCole Faust        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
6124*c217d954SCole Faust#endif
6125*c217d954SCole Faust#if M0 > 7
6126*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 2)
6127*c217d954SCole Faust        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
6128*c217d954SCole Faust#endif
6129*c217d954SCole Faust
6130*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, N0)
6131*c217d954SCole Faust        b0;
6132*c217d954SCole Faust
6133*c217d954SCole Faust        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
6134*c217d954SCole Faust        VFMA_M0xN0(0, a, b0, c);
6135*c217d954SCole Faust
6136*c217d954SCole Faust        lhs_offset += sizeof(DATA_TYPE);
6137*c217d954SCole Faust        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
6138*c217d954SCole Faust    }
6139*c217d954SCole Faust
6140*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
6141*c217d954SCole Faust
6142*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);
6143*c217d954SCole Faust
6144*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
6145*c217d954SCole Faust
6146*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
6147*c217d954SCole Faust
6148*c217d954SCole Faust
6149*c217d954SCole Faust
6150*c217d954SCole Faust    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
6151*c217d954SCole Faust
6152*c217d954SCole Faust#else
6153*c217d954SCole Faust
6154*c217d954SCole Faust
6155*c217d954SCole Faust    dst_addr += z * dst_stride_z;
6156*c217d954SCole Faust
6157*c217d954SCole Faust#endif
6158*c217d954SCole Faust
6159*c217d954SCole Faust
6160*c217d954SCole Faust#if defined(ALPHA)
6161*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
6162*c217d954SCole Faust#endif
6163*c217d954SCole Faust
6164*c217d954SCole Faust
6165*c217d954SCole Faust#if defined(BETA)
6166*c217d954SCole Faust#if defined(BROADCAST_BIAS)
6167*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
6168*c217d954SCole Faust
6169*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
6170*c217d954SCole Faust
6171*c217d954SCole Faust#ifndef UNIT_BETA
6172*c217d954SCole Faust    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
6173*c217d954SCole Faust#endif
6174*c217d954SCole Faust
6175*c217d954SCole Faust
6176*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias0);
6177*c217d954SCole Faust
6178*c217d954SCole Faust#else
6179*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
6180*c217d954SCole Faust
6181*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6182*c217d954SCole Faust
6183*c217d954SCole Faust#ifndef UNIT_BETA
6184*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
6185*c217d954SCole Faust#endif
6186*c217d954SCole Faust
6187*c217d954SCole Faust
6188*c217d954SCole Faust    ADD_BLOCK(M0, c, bias);
6189*c217d954SCole Faust
6190*c217d954SCole Faust#endif
6191*c217d954SCole Faust#endif
6192*c217d954SCole Faust
6193*c217d954SCole Faust#if defined(ACTIVATION_TYPE)
6194*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
6195*c217d954SCole Faust#endif
6196*c217d954SCole Faust
6197*c217d954SCole Faust
6198*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6199*c217d954SCole Faust
6200*c217d954SCole Faust#undef RHS_BLOCK_SIZE
6201*c217d954SCole Faust#undef RHS_OFFSET_X
6202*c217d954SCole Faust#undef RHS_STEP_X
6203*c217d954SCole Faust#undef RHS_STEP_LOOP
6204*c217d954SCole Faust}
6205*c217d954SCole Faust#endif
6206*c217d954SCole Faust
#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)

/** Matrix-multiply kernel whose RHS operand is sampled from a 2D image (rhs_img) through READ_IMAGE2D.
 *
 * Each work-item accumulates an M0 x N0 block `c`:
 *   1. a main loop consumes K in steps of K0 (the #if ladder covers K0 = 2, 3, 4, 8 or 16),
 *   2. a tail loop consumes the remaining iterations one at a time,
 *   3. the block is optionally scaled (ALPHA), combined with a bias (BETA / BROADCAST_BIAS / UNIT_BETA),
 *      passed through ACTIVATION_BLOCK (ACTIVATION_TYPE) and written with STORE_BLOCK_BOUNDARY_AWARE.
 *
 * Build-time macros read by this kernel: M0, N0, K0, H0, DATA_TYPE, PARTIAL_STORE_M0, PARTIAL_STORE_N0,
 * RHS_HEIGHT; optionally RHS_INTERLEAVE, MATRIX_B_DEPTH, DUMMY_WORK_ITEMS, REINTERPRET_INPUT_AS_3D,
 * REINTERPRET_OUTPUT_AS_3D (both use HEIGHT_GEMM3D and DEPTH_GEMM3D), ALPHA, BETA, A_VAL, B_VAL.
 *
 * @param lhs                 LHS buffer. IMAGE_DECLARATION is defined outside this block; the body relies on it
 *                            providing lhs_ptr, lhs_stride_y and lhs_offset_first_element_in_bytes.
 * @param rhs_img             RHS operand, read-only 2D image.
 * @param bias                (BETA only) bias buffer; provides bias_ptr, bias_stride_y,
 *                            bias_offset_first_element_in_bytes as used below.
 * @param dst                 Destination buffer; provides dst_ptr, dst_stride_y, dst_offset_first_element_in_bytes.
 * @param lhs_stride_z        Byte stride applied per z index of the LHS.
 * @param rhs_stride_z        Not read by this kernel (the RHS batch is addressed through RHS_HEIGHT image rows).
 * @param bias_stride_z       (BETA only) byte stride applied per z index of the bias.
 * @param dst_stride_z        Byte stride applied per z index of the destination.
 * @param lhs_cross_plane_pad (REINTERPRET_INPUT_AS_3D only) forwarded to CALCULATE_Z_OFFSET for the LHS.
 * @param dst_cross_plane_pad (REINTERPRET_OUTPUT_AS_3D only) forwarded to CALCULATE_Z_OFFSET for the destination.
 * @param M                   Row bound used by the DUMMY_WORK_ITEMS early exit.
 * @param N                   Column bound used by the DUMMY_WORK_ITEMS early exit and by cond_x.
 * @param K                   Length of the accumulation dimension.
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                   __read_only image2d_t rhs_img,
#if defined(BETA)
                                                   IMAGE_DECLARATION(bias),
#endif
                                                   IMAGE_DECLARATION(dst),
                                                   uint lhs_stride_z,
                                                   uint rhs_stride_z,
#if defined(BETA)
                                                   uint bias_stride_z,
#endif
                                                   uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                   ,
                                                   uint lhs_cross_plane_pad
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                   ,
                                                   uint dst_cross_plane_pad
#endif
                                                   ,
                                                   const int M,
                                                   const int N,
                                                   const int K)
{
    // Pixel count handed to READ_IMAGE2D for one N0-wide RHS row; also the unit of every x step in rhs_img.
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Size of one K0-row RHS block, expressed in PIXEL_UNIT steps.
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS_OFFSET_X : x distance between the first rows of two of the H0 blocks sharing an image row
    // RHS_STEP_X   : x distance between consecutive rows of the same block
    // RHS_STEP_LOOP: extra multiplier applied when the main loop advances by K0 rows
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP (1)
#else
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // Flags forwarded to the boundary-aware load/store macros together with PARTIAL_STORE_M0 / PARTIAL_STORE_N0.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block origin lies outside the N x M range have nothing to compute.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Byte offset of the first LHS row handled by this work-item.
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // The RHS batch index wraps around MATRIX_B_DEPTH.
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else
    const uint z_rhs = z;
#endif

    // Image coordinates of the first RHS row to read: H0 blocks share one image row, and each RHS batch
    // occupies RHS_HEIGHT image rows.
    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    // zin0..zin7: per-row extra LHS offsets (stay 0 unless REINTERPRET_INPUT_AS_3D); zero0..zero15: used by the bias load.
    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)

    // Fill zin with the per-row offsets computed by CALCULATE_Z_OFFSET.
    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // One z step spans DEPTH_GEMM3D strides of the LHS in this mode.
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else

    lhs_offset += z * lhs_stride_z;

#endif

    // Accumulators c0..c(M0-1), each an N0-wide vector, start at zero.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: K0 accumulation steps per iteration.
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load an M0 x K0 block of the LHS into a0..a(M0-1).
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        // For each of the K0 RHS rows: read one N0-wide row and accumulate it against the matching LHS lane.
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(0, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(2, a, b0, c);
#endif
#if K0 > 3
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(3, a, b0, c);
#endif
#if K0 > 4
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(4, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(5, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(6, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(7, a, b0, c);
#endif
#if K0 > 8
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(8, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(9, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(A, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(B, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(C, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(D, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(E, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(F, a, b0, c);
#endif

        // Advance K0 elements along the LHS rows and K0 rows along the RHS.
        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Tail loop: one accumulation step per iteration for whatever the main loop left over.
    for(; i < K; ++i)
    {
        // One scalar per LHS row, held in a 2-wide vector so that VFMA_M0xN0 can be reused with lane 0.
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        x_rhs += RHS_STEP_X;
    }

    // Address of the first destination element of this work-item's block.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    // zout0..zout7: per-row extra destination offsets (stay 0 unless REINTERPRET_OUTPUT_AS_3D).
    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // One z step spans DEPTH_GEMM3D strides of the destination in this mode.
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else

    dst_addr += z * dst_stride_z;

#endif

    // c *= ALPHA
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif

    // c += BETA * bias (the scaling is skipped when UNIT_BETA is defined)
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row, addressed by column only, is added to every row of c.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else
    // A full M0 x N0 bias block, addressed like the destination block plus a per-z stride.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK(M0, c, bias);

#endif
#endif

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif

    // Write the result block.
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // Drop every helper macro defined inside this kernel so none of them leaks into later code.
    // PIXEL_UNIT was previously left defined.
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef PIXEL_UNIT
}
#endif
6466*c217d954SCole Faust#endif
6467*c217d954SCole Faust
6468*c217d954SCole Faust#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
6469*c217d954SCole Faust
/* ARM_DOT_K0(a, b, c)
 *
 * Accumulates into c the lane-wise products of a and b for lanes s0 .. s(K0-1), one lane after the other in
 * ascending order. Supported K0 values: 2, 3, 4, 8, 16; any other value stops compilation.
 *
 * One accumulation step is:
 *   MIXED_PRECISION defined : c += a.sX * b.sX
 *   otherwise               : c = fma(a.sX, b.sX, c)
 */
#if defined(MIXED_PRECISION)
#define ARM_DOT_K0_LANE(a, b, c, lane) c += a.s##lane * b.s##lane
#else
#define ARM_DOT_K0_LANE(a, b, c, lane) c = fma(a.s##lane, b.s##lane, c)
#endif

/* Cumulative step lists: each one extends the previous list with the next lanes. */
#define ARM_DOT_K0_LANES_2(a, b, c) \
    ARM_DOT_K0_LANE(a, b, c, 0);    \
    ARM_DOT_K0_LANE(a, b, c, 1);

#define ARM_DOT_K0_LANES_3(a, b, c) \
    ARM_DOT_K0_LANES_2(a, b, c)     \
    ARM_DOT_K0_LANE(a, b, c, 2);

#define ARM_DOT_K0_LANES_4(a, b, c) \
    ARM_DOT_K0_LANES_3(a, b, c)     \
    ARM_DOT_K0_LANE(a, b, c, 3);

#define ARM_DOT_K0_LANES_8(a, b, c) \
    ARM_DOT_K0_LANES_4(a, b, c)     \
    ARM_DOT_K0_LANE(a, b, c, 4);    \
    ARM_DOT_K0_LANE(a, b, c, 5);    \
    ARM_DOT_K0_LANE(a, b, c, 6);    \
    ARM_DOT_K0_LANE(a, b, c, 7);

#define ARM_DOT_K0_LANES_16(a, b, c) \
    ARM_DOT_K0_LANES_8(a, b, c)      \
    ARM_DOT_K0_LANE(a, b, c, 8);     \
    ARM_DOT_K0_LANE(a, b, c, 9);     \
    ARM_DOT_K0_LANE(a, b, c, A);     \
    ARM_DOT_K0_LANE(a, b, c, B);     \
    ARM_DOT_K0_LANE(a, b, c, C);     \
    ARM_DOT_K0_LANE(a, b, c, D);     \
    ARM_DOT_K0_LANE(a, b, c, E);     \
    ARM_DOT_K0_LANE(a, b, c, F);

/* Select the step list matching K0 and wrap it in a statement expression, as the callers expect. */
#if K0 == 2
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_2(a, b, c) })
#elif K0 == 3
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_3(a, b, c) })
#elif K0 == 4
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_4(a, b, c) })
#elif K0 == 8
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_8(a, b, c) })
#elif K0 == 16
#define ARM_DOT_K0(a, b, c) ({ ARM_DOT_K0_LANES_16(a, b, c) })
#else
#error "K0 value not supported"
#endif
6585*c217d954SCole Faust
// ARM_DOT_K0XN0(a, b, c)
//
// For every lane j in [0, N0), accumulates ARM_DOT_K0 of the vector `a` and the vector `b<j>`
// into lane `c.s<j>`. `b` is a base name: the lane index is token-pasted onto it
// (b0, b1, ..., b9, bA, ..., bF), so it must be passed as a bare identifier.
// N0 must be one of 2, 3, 4, 8 or 16; any other value stops the build.
//
// A previous definition (if any) is discarded first, so this section can follow another
// definition of the same macro.
#if defined(ARM_DOT_K0XN0)
#undef ARM_DOT_K0XN0
#endif
#if defined(ARM_DOT_K0XN0_LANE)
#undef ARM_DOT_K0XN0_LANE
#endif

// One lane of the update: c.s<j> accumulates the ARM_DOT_K0 product of a and b<j>.
// `j` is pasted verbatim, so it must be a single hexadecimal digit token (0-9, A-F).
#define ARM_DOT_K0XN0_LANE(a, b, c, j) ARM_DOT_K0((a), (b##j), (c.s##j))

#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)          \
    ({                                  \
        ARM_DOT_K0XN0_LANE(a, b, c, 0); \
        ARM_DOT_K0XN0_LANE(a, b, c, 1); \
    })
#elif N0 == 3
#define ARM_DOT_K0XN0(a, b, c)          \
    ({                                  \
        ARM_DOT_K0XN0_LANE(a, b, c, 0); \
        ARM_DOT_K0XN0_LANE(a, b, c, 1); \
        ARM_DOT_K0XN0_LANE(a, b, c, 2); \
    })
#elif N0 == 4
#define ARM_DOT_K0XN0(a, b, c)          \
    ({                                  \
        ARM_DOT_K0XN0_LANE(a, b, c, 0); \
        ARM_DOT_K0XN0_LANE(a, b, c, 1); \
        ARM_DOT_K0XN0_LANE(a, b, c, 2); \
        ARM_DOT_K0XN0_LANE(a, b, c, 3); \
    })
#elif N0 == 8
#define ARM_DOT_K0XN0(a, b, c)          \
    ({                                  \
        ARM_DOT_K0XN0_LANE(a, b, c, 0); \
        ARM_DOT_K0XN0_LANE(a, b, c, 1); \
        ARM_DOT_K0XN0_LANE(a, b, c, 2); \
        ARM_DOT_K0XN0_LANE(a, b, c, 3); \
        ARM_DOT_K0XN0_LANE(a, b, c, 4); \
        ARM_DOT_K0XN0_LANE(a, b, c, 5); \
        ARM_DOT_K0XN0_LANE(a, b, c, 6); \
        ARM_DOT_K0XN0_LANE(a, b, c, 7); \
    })
#elif N0 == 16
#define ARM_DOT_K0XN0(a, b, c)          \
    ({                                  \
        ARM_DOT_K0XN0_LANE(a, b, c, 0); \
        ARM_DOT_K0XN0_LANE(a, b, c, 1); \
        ARM_DOT_K0XN0_LANE(a, b, c, 2); \
        ARM_DOT_K0XN0_LANE(a, b, c, 3); \
        ARM_DOT_K0XN0_LANE(a, b, c, 4); \
        ARM_DOT_K0XN0_LANE(a, b, c, 5); \
        ARM_DOT_K0XN0_LANE(a, b, c, 6); \
        ARM_DOT_K0XN0_LANE(a, b, c, 7); \
        ARM_DOT_K0XN0_LANE(a, b, c, 8); \
        ARM_DOT_K0XN0_LANE(a, b, c, 9); \
        ARM_DOT_K0XN0_LANE(a, b, c, A); \
        ARM_DOT_K0XN0_LANE(a, b, c, B); \
        ARM_DOT_K0XN0_LANE(a, b, c, C); \
        ARM_DOT_K0XN0_LANE(a, b, c, D); \
        ARM_DOT_K0XN0_LANE(a, b, c, E); \
        ARM_DOT_K0XN0_LANE(a, b, c, F); \
    })
#else
#error "N0 value not supported"
#endif
6646*c217d954SCole Faust
// gemm_mm_reshaped_lhs_nt_rhs_t
//
// Block matrix-multiplication kernel; only compiled when GEMM_MM_RESHAPED_LHS_NT_RHS_T is defined.
// Each work-item walks the K dimension in steps of K0, accumulating into M0 accumulator
// vectors c0..c<M0-1> (N0 lanes each, element type DATA_TYPE_ACCUMULATOR). It then optionally
// applies ALPHA scaling, a BETA-scaled bias and an activation, and finally writes the tile to
// dst through a boundary-aware store.
//
// Build-time configuration read here: M0, N0, K0, V0, H0, DATA_TYPE, DATA_TYPE_ACCUMULATOR,
// PARTIAL_STORE_M0, PARTIAL_STORE_N0, plus the optional switches LHS_INTERLEAVE, RHS_INTERLEAVE,
// DUMMY_WORK_ITEMS, MATRIX_B_DEPTH, REINTERPRET_OUTPUT_AS_3D (with HEIGHT_GEMM3D / DEPTH_GEMM3D),
// ALPHA, BETA, UNIT_BETA, BROADCAST_BIAS, MIXED_PRECISION and ACTIVATION_TYPE (with A_VAL / B_VAL).
//
// Arguments:
//   lhs, rhs, bias, dst - tensors declared through IMAGE_DECLARATION (defined outside this view;
//                         presumably it provides the <name>_ptr, <name>_stride_y and
//                         <name>_offset_first_element_in_bytes names used below - confirm).
//                         bias is only present when BETA is defined.
//   lhs_stride_z, rhs_stride_z, bias_stride_z, dst_stride_z
//                       - byte strides multiplied by get_global_id(2) (bias_stride_z only with BETA).
//   dst_cross_plane_pad - only with REINTERPRET_OUTPUT_AS_3D; forwarded to CALCULATE_Z_OFFSET.
//   M, N                - output extents used for the out-of-range and edge-tile checks.
//   K                   - upper bound of the accumulation loop.
6647*c217d954SCole Faust#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
6648*c217d954SCole Faust
6649*c217d954SCole Faust__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
6650*c217d954SCole Faust                                            IMAGE_DECLARATION(rhs),
6651*c217d954SCole Faust#if defined(BETA)
6652*c217d954SCole Faust                                            IMAGE_DECLARATION(bias),
6653*c217d954SCole Faust#endif
6654*c217d954SCole Faust                                            IMAGE_DECLARATION(dst),
6655*c217d954SCole Faust                                            uint lhs_stride_z,
6656*c217d954SCole Faust                                            uint rhs_stride_z,
6657*c217d954SCole Faust#if defined(BETA)
6658*c217d954SCole Faust                                            uint bias_stride_z,
6659*c217d954SCole Faust#endif
6660*c217d954SCole Faust                                            uint dst_stride_z
6661*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
6662*c217d954SCole Faust                                            ,
6663*c217d954SCole Faust                                            uint dst_cross_plane_pad
6664*c217d954SCole Faust#endif
6665*c217d954SCole Faust                                            ,
6666*c217d954SCole Faust                                            const int M,
6667*c217d954SCole Faust                                            const int N,
6668*c217d954SCole Faust                                            const int K)
6669*c217d954SCole Faust{
6670*c217d954SCole Faust
    // Number of elements in one M0 x K0 LHS block.
6671*c217d954SCole Faust#define LHS_BLOCK_SIZE ((K0) * (M0))
6672*c217d954SCole Faust
    // LHS stepping, in elements. With LHS_INTERLEAVE consecutive work-item blocks are K0 elements
    // apart and rows of one block are K0 * V0 apart; otherwise blocks are LHS_BLOCK_SIZE apart,
    // rows are K0 apart, and the per-iteration advance is multiplied by V0.
6673*c217d954SCole Faust#if defined(LHS_INTERLEAVE)
6674*c217d954SCole Faust#define LHS_OFFSET_X (K0)
6675*c217d954SCole Faust#define LHS_STEP_X ((K0) * (V0))
6676*c217d954SCole Faust#define LHS_STEP_LOOP (1)
6677*c217d954SCole Faust#else
6678*c217d954SCole Faust#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
6679*c217d954SCole Faust#define LHS_STEP_X (K0)
6680*c217d954SCole Faust#define LHS_STEP_LOOP (V0)
6681*c217d954SCole Faust#endif
6682*c217d954SCole Faust
6683*c217d954SCole Faust
    // Number of elements in one K0 x N0 RHS block.
6684*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (N0))
6685*c217d954SCole Faust
6686*c217d954SCole Faust
    // RHS stepping, in elements; mirrors the LHS scheme with H0 in place of V0.
6687*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
6688*c217d954SCole Faust#define RHS_OFFSET_X (K0)
6689*c217d954SCole Faust#define RHS_STEP_X ((K0) * (H0))
6690*c217d954SCole Faust#define RHS_STEP_LOOP (1)
6691*c217d954SCole Faust#else
6692*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
6693*c217d954SCole Faust#define RHS_STEP_X (K0)
6694*c217d954SCole Faust#define RHS_STEP_LOOP (H0)
6695*c217d954SCole Faust#endif
6696*c217d954SCole Faust
    // With DUMMY_WORK_ITEMS, work-items whose tile starts at or beyond N (x) or M (y) exit early.
6697*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
6698*c217d954SCole Faust    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
6699*c217d954SCole Faust    {
6700*c217d954SCole Faust        return;
6701*c217d954SCole Faust    }
6702*c217d954SCole Faust#endif
6703*c217d954SCole Faust
6704*c217d954SCole Faust
    // Byte address of this work-item's first LHS block: (id1 % V0) selects the block inside a
    // reshaped row, (id1 / V0) selects the row, id2 selects the batch.
6705*c217d954SCole Faust    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
6706*c217d954SCole Faust                               (get_global_id(2) * lhs_stride_z);
6707*c217d954SCole Faust
6708*c217d954SCole Faust
    // Byte address of this work-item's first RHS block, indexed the same way with id0 and H0.
6709*c217d954SCole Faust    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
6710*c217d954SCole Faust
    // RHS batch offset: with MATRIX_B_DEPTH the batch index wraps modulo MATRIX_B_DEPTH.
6711*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
6712*c217d954SCole Faust
6713*c217d954SCole Faust    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
6714*c217d954SCole Faust#else
6715*c217d954SCole Faust    rhs_addr += get_global_id(2) * rhs_stride_z;
6716*c217d954SCole Faust#endif
6717*c217d954SCole Faust
6718*c217d954SCole Faust
    // Accumulators c0..c<M0-1>, initialised with 0 (REPEAT_VAR_INIT_TO_CONST is defined outside
    // this view; presumably it declares <name>0..<name><n-1> - confirm).
6719*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
6720*c217d954SCole Faust
    // Zero-valued offsets handed to the block load macros below.
6721*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
6722*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
6723*c217d954SCole Faust
    // Main accumulation loop: K0 elements of the K dimension per iteration.
6724*c217d954SCole Faust    for(int i = 0; i < K; i += K0)
6725*c217d954SCole Faust    {
6726*c217d954SCole Faust
6727*c217d954SCole Faust
6728*c217d954SCole Faust
6729*c217d954SCole Faust
6730*c217d954SCole Faust
6731*c217d954SCole Faust
6732*c217d954SCole Faust
6733*c217d954SCole Faust
6734*c217d954SCole Faust
6735*c217d954SCole Faust
        // Load the LHS block into a0..a<M0-1>; rows are LHS_STEP_X elements apart.
6736*c217d954SCole Faust        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
6737*c217d954SCole Faust
6738*c217d954SCole Faust
        // Load the RHS block into b0..b<N0-1>; rows are RHS_STEP_X elements apart.
6739*c217d954SCole Faust        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
6740*c217d954SCole Faust
6741*c217d954SCole Faust
        // For each row i < M0: lane j of c<i> accumulates the K0-element product of a<i> and b<j>.
6742*c217d954SCole Faust        ARM_DOT_K0XN0(a0, b, c0);
6743*c217d954SCole Faust#if M0 > 1
6744*c217d954SCole Faust        ARM_DOT_K0XN0(a1, b, c1);
6745*c217d954SCole Faust#endif
6746*c217d954SCole Faust#if M0 > 2
6747*c217d954SCole Faust        ARM_DOT_K0XN0(a2, b, c2);
6748*c217d954SCole Faust#endif
6749*c217d954SCole Faust#if M0 > 3
6750*c217d954SCole Faust        ARM_DOT_K0XN0(a3, b, c3);
6751*c217d954SCole Faust#endif
6752*c217d954SCole Faust#if M0 > 4
6753*c217d954SCole Faust        ARM_DOT_K0XN0(a4, b, c4);
6754*c217d954SCole Faust#endif
6755*c217d954SCole Faust#if M0 > 5
6756*c217d954SCole Faust        ARM_DOT_K0XN0(a5, b, c5);
6757*c217d954SCole Faust#endif
6758*c217d954SCole Faust#if M0 > 6
6759*c217d954SCole Faust        ARM_DOT_K0XN0(a6, b, c6);
6760*c217d954SCole Faust#endif
6761*c217d954SCole Faust#if M0 > 7
6762*c217d954SCole Faust        ARM_DOT_K0XN0(a7, b, c7);
6763*c217d954SCole Faust#endif
6764*c217d954SCole Faust
        // Advance both operands to the next K0 slice (byte increments).
6765*c217d954SCole Faust        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
6766*c217d954SCole Faust        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
6767*c217d954SCole Faust    }
6768*c217d954SCole Faust
    // Byte address of the first element of this work-item's output tile (batch offset added below).
6769*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
6770*c217d954SCole Faust
    // Per-row z offsets for the store; stay 0 unless REINTERPRET_OUTPUT_AS_3D fills them in.
6771*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
6772*c217d954SCole Faust
    // True when the tile reaches or crosses the M (y) / N (x) boundary; both flags are passed to
    // the boundary-aware load/store macros together with PARTIAL_STORE_M0 / PARTIAL_STORE_N0.
6773*c217d954SCole Faust    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
6774*c217d954SCole Faust    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
6775*c217d954SCole Faust
6776*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
6777*c217d954SCole Faust
6778*c217d954SCole Faust
    // 3D output: zout<i> is computed by CALCULATE_Z_OFFSET and the batch stride is scaled by
    // DEPTH_GEMM3D.
6779*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
6780*c217d954SCole Faust
6781*c217d954SCole Faust
6782*c217d954SCole Faust    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
6783*c217d954SCole Faust
6784*c217d954SCole Faust#else
6785*c217d954SCole Faust
6786*c217d954SCole Faust
    // 2D output: plain per-batch offset.
6787*c217d954SCole Faust    dst_addr += get_global_id(2) * dst_stride_z;
6788*c217d954SCole Faust
6789*c217d954SCole Faust#endif
6790*c217d954SCole Faust
6791*c217d954SCole Faust
    // Optional scaling of the accumulators by ALPHA.
6792*c217d954SCole Faust#if defined(ALPHA)
6793*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
6794*c217d954SCole Faust#endif
6795*c217d954SCole Faust
6796*c217d954SCole Faust
    // Optional bias addition. The bias is scaled by BETA unless UNIT_BETA is defined.
6797*c217d954SCole Faust#if defined(BETA)
6798*c217d954SCole Faust#if defined(BROADCAST_BIAS)
    // Broadcast bias: a single row of N0 values at this tile's column offset is loaded and added
    // to the M0 accumulator rows through ADD_BLOCK_BROADCAST.
6799*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
6800*c217d954SCole Faust
6801*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
6802*c217d954SCole Faust
6803*c217d954SCole Faust#ifndef UNIT_BETA
6804*c217d954SCole Faust    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
6805*c217d954SCole Faust#endif
6806*c217d954SCole Faust
6807*c217d954SCole Faust
    // With MIXED_PRECISION the bias is converted to DATA_TYPE_ACCUMULATOR before the addition.
6808*c217d954SCole Faust#if defined(MIXED_PRECISION)
6809*c217d954SCole Faust    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
6810*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
6811*c217d954SCole Faust#else
6812*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias0);
6813*c217d954SCole Faust#endif
6814*c217d954SCole Faust
6815*c217d954SCole Faust#else
    // Full bias: an M0 x N0 tile addressed like the output tile (including the batch offset) is
    // loaded with the same edge conditions and added row by row through ADD_BLOCK.
6816*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
6817*c217d954SCole Faust                                    2) * bias_stride_z;
6818*c217d954SCole Faust
6819*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6820*c217d954SCole Faust
6821*c217d954SCole Faust#ifndef UNIT_BETA
6822*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
6823*c217d954SCole Faust#endif
6824*c217d954SCole Faust
6825*c217d954SCole Faust
6826*c217d954SCole Faust#if defined(MIXED_PRECISION)
6827*c217d954SCole Faust    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
6828*c217d954SCole Faust    ADD_BLOCK(M0, c, bias_hp);
6829*c217d954SCole Faust#else
6830*c217d954SCole Faust    ADD_BLOCK(M0, c, bias);
6831*c217d954SCole Faust#endif
6832*c217d954SCole Faust
6833*c217d954SCole Faust#endif
6834*c217d954SCole Faust#endif
6835*c217d954SCole Faust
    // Optional activation, applied in the accumulator's element type.
6836*c217d954SCole Faust#if defined(ACTIVATION_TYPE)
6837*c217d954SCole Faust#if defined(MIXED_PRECISION)
6838*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
6839*c217d954SCole Faust#else
6840*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
6841*c217d954SCole Faust#endif
6842*c217d954SCole Faust#endif
6843*c217d954SCole Faust
6844*c217d954SCole Faust
    // Store the tile. With MIXED_PRECISION the accumulators are first converted to DATA_TYPE
    // (c_lp) because dst holds DATA_TYPE elements.
6845*c217d954SCole Faust#if defined(MIXED_PRECISION)
6846*c217d954SCole Faust    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
6847*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6848*c217d954SCole Faust#else
6849*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6850*c217d954SCole Faust#endif
6851*c217d954SCole Faust
    // Remove the kernel-local helper macros; the next kernel in this file defines its own.
6852*c217d954SCole Faust#undef LHS_BLOCK_SIZE
6853*c217d954SCole Faust#undef LHS_OFFSET_X
6854*c217d954SCole Faust#undef LHS_STEP_X
6855*c217d954SCole Faust#undef RHS_BLOCK_SIZE
6856*c217d954SCole Faust#undef RHS_OFFSET_X
6857*c217d954SCole Faust#undef RHS_STEP_X
6858*c217d954SCole Faust#undef LHS_STEP_LOOP
6859*c217d954SCole Faust#undef RHS_STEP_LOOP
6860*c217d954SCole Faust}
6861*c217d954SCole Faust#endif
6862*c217d954SCole Faust
// gemm_mm_reshaped_lhs_nt_rhs_t_texture
//
// Variant of the block matrix-multiplication kernel that reads the RHS operand from a read-only
// 2D image (rhs_img) through LOAD_TEXTURE2D instead of a buffer pointer. Only compiled when both
// OPENCL_IMAGE_SUPPORT and GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE are defined.
// Each work-item walks the K dimension in steps of K0, accumulating into M0 accumulator
// vectors c0..c<M0-1> (N0 lanes each, element type DATA_TYPE_ACCUMULATOR). It then optionally
// applies ALPHA scaling, a BETA-scaled bias and an activation, and finally writes the tile to
// dst through a boundary-aware store.
//
// Build-time configuration read here: M0, N0, K0, V0, H0, RHS_HEIGHT, DATA_TYPE,
// DATA_TYPE_ACCUMULATOR, PARTIAL_STORE_M0, PARTIAL_STORE_N0, plus the optional switches
// LHS_INTERLEAVE, RHS_INTERLEAVE, DUMMY_WORK_ITEMS, MATRIX_B_DEPTH, REINTERPRET_OUTPUT_AS_3D
// (with HEIGHT_GEMM3D / DEPTH_GEMM3D), ALPHA, BETA, UNIT_BETA, BROADCAST_BIAS, MIXED_PRECISION
// and ACTIVATION_TYPE (with A_VAL / B_VAL).
//
// Arguments:
//   lhs, bias, dst      - tensors declared through IMAGE_DECLARATION (defined outside this view;
//                         presumably it provides the <name>_ptr, <name>_stride_y and
//                         <name>_offset_first_element_in_bytes names used below - confirm).
//                         bias is only present when BETA is defined.
//   rhs_img             - read-only 2D image holding the RHS operand.
//   lhs_stride_z, bias_stride_z, dst_stride_z
//                       - byte strides multiplied by get_global_id(2) (bias_stride_z only with BETA).
//   rhs_stride_z        - part of the signature but not referenced in this kernel body.
//   dst_cross_plane_pad - only with REINTERPRET_OUTPUT_AS_3D; forwarded to CALCULATE_Z_OFFSET.
//   M, N                - output extents used for the out-of-range and edge-tile checks.
//   K                   - upper bound of the accumulation loop.
6863*c217d954SCole Faust#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
6864*c217d954SCole Faust
6865*c217d954SCole Faust__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
6866*c217d954SCole Faust                                                    __read_only image2d_t rhs_img,
6867*c217d954SCole Faust#if defined(BETA)
6868*c217d954SCole Faust                                                    IMAGE_DECLARATION(bias),
6869*c217d954SCole Faust#endif
6870*c217d954SCole Faust                                                    IMAGE_DECLARATION(dst),
6871*c217d954SCole Faust                                                    uint lhs_stride_z,
6872*c217d954SCole Faust                                                    uint rhs_stride_z,
6873*c217d954SCole Faust#if defined(BETA)
6874*c217d954SCole Faust                                                    uint bias_stride_z,
6875*c217d954SCole Faust#endif
6876*c217d954SCole Faust                                                    uint dst_stride_z
6877*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
6878*c217d954SCole Faust                                                    ,
6879*c217d954SCole Faust                                                    uint dst_cross_plane_pad
6880*c217d954SCole Faust#endif
6881*c217d954SCole Faust                                                    ,
6882*c217d954SCole Faust                                                    const int M,
6883*c217d954SCole Faust                                                    const int N,
6884*c217d954SCole Faust                                                    const int K)
6885*c217d954SCole Faust{
6886*c217d954SCole Faust
    // RHS image coordinates are expressed in PIXEL_UNIT steps (presumably the number of image
    // pixels that hold K0 elements - confirm against CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT).
6887*c217d954SCole Faust#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
6888*c217d954SCole Faust
6889*c217d954SCole Faust
    // Number of elements in one M0 x K0 LHS block.
6890*c217d954SCole Faust#define LHS_BLOCK_SIZE ((K0) * (M0))
6891*c217d954SCole Faust
    // LHS stepping, in elements. With LHS_INTERLEAVE consecutive work-item blocks are K0 elements
    // apart and rows of one block are K0 * V0 apart; otherwise blocks are LHS_BLOCK_SIZE apart,
    // rows are K0 apart, and the per-iteration advance is multiplied by V0.
6892*c217d954SCole Faust#if defined(LHS_INTERLEAVE)
6893*c217d954SCole Faust#define LHS_OFFSET_X (K0)
6894*c217d954SCole Faust#define LHS_STEP_X ((K0) * (V0))
6895*c217d954SCole Faust#define LHS_STEP_LOOP (1)
6896*c217d954SCole Faust#else
6897*c217d954SCole Faust#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
6898*c217d954SCole Faust#define LHS_STEP_X (K0)
6899*c217d954SCole Faust#define LHS_STEP_LOOP (V0)
6900*c217d954SCole Faust#endif
6901*c217d954SCole Faust
6902*c217d954SCole Faust
    // Size of one RHS block, in PIXEL_UNIT steps rather than elements.
6903*c217d954SCole Faust#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
6904*c217d954SCole Faust
6905*c217d954SCole Faust
    // RHS stepping, in PIXEL_UNIT steps; mirrors the LHS scheme with H0 in place of V0.
6906*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
6907*c217d954SCole Faust#define RHS_OFFSET_X (PIXEL_UNIT)
6908*c217d954SCole Faust#define RHS_STEP_X (PIXEL_UNIT * (H0))
6909*c217d954SCole Faust#define RHS_STEP_LOOP (1)
6910*c217d954SCole Faust#else
6911*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
6912*c217d954SCole Faust#define RHS_STEP_X PIXEL_UNIT
6913*c217d954SCole Faust#define RHS_STEP_LOOP (H0)
6914*c217d954SCole Faust#endif
6915*c217d954SCole Faust
    // With DUMMY_WORK_ITEMS, work-items whose tile starts at or beyond N (x) or M (y) exit early.
6916*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
6917*c217d954SCole Faust    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
6918*c217d954SCole Faust    {
6919*c217d954SCole Faust        return;
6920*c217d954SCole Faust    }
6921*c217d954SCole Faust#endif
6922*c217d954SCole Faust
6923*c217d954SCole Faust
    // Byte address of this work-item's first LHS block: (id1 % V0) selects the block inside a
    // reshaped row, (id1 / V0) selects the row, id2 selects the batch.
6924*c217d954SCole Faust    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
6925*c217d954SCole Faust                               (get_global_id(2) * lhs_stride_z);
6926*c217d954SCole Faust
    // RHS batch index: with MATRIX_B_DEPTH it wraps modulo MATRIX_B_DEPTH.
6927*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
6928*c217d954SCole Faust
6929*c217d954SCole Faust    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
6930*c217d954SCole Faust#else
6931*c217d954SCole Faust    const uint z_rhs = get_global_id(2);
6932*c217d954SCole Faust#endif
6933*c217d954SCole Faust
6934*c217d954SCole Faust
    // Image coordinates of the first RHS block. x advances inside the loop; y is fixed, with
    // batch z_rhs placed RHS_HEIGHT rows below batch z_rhs - 1.
6935*c217d954SCole Faust    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
6936*c217d954SCole Faust    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
6937*c217d954SCole Faust
6938*c217d954SCole Faust
    // Accumulators c0..c<M0-1>, initialised with 0 (REPEAT_VAR_INIT_TO_CONST is defined outside
    // this view; presumably it declares <name>0..<name><n-1> - confirm).
6939*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
6940*c217d954SCole Faust
    // Zero-valued offsets handed to the block load macros below.
6941*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
6942*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
6943*c217d954SCole Faust
    // Main accumulation loop: K0 elements of the K dimension per iteration.
6944*c217d954SCole Faust    for(int i = 0; i < K; i += K0)
6945*c217d954SCole Faust    {
6946*c217d954SCole Faust
        // Load the LHS block into a0..a<M0-1>; rows are LHS_STEP_X elements apart.
6947*c217d954SCole Faust        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
6948*c217d954SCole Faust
6949*c217d954SCole Faust
        // b0..b<N0-1> are zero-initialised, then filled from the image starting at (x_rhs, y_rhs)
        // with an x stride of RHS_STEP_X between rows.
6950*c217d954SCole Faust        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
6951*c217d954SCole Faust        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
6952*c217d954SCole Faust
6953*c217d954SCole Faust
        // For each row i < M0: lane j of c<i> accumulates the K0-element product of a<i> and b<j>.
6954*c217d954SCole Faust        ARM_DOT_K0XN0(a0, b, c0);
6955*c217d954SCole Faust#if M0 > 1
6956*c217d954SCole Faust        ARM_DOT_K0XN0(a1, b, c1);
6957*c217d954SCole Faust#endif
6958*c217d954SCole Faust#if M0 > 2
6959*c217d954SCole Faust        ARM_DOT_K0XN0(a2, b, c2);
6960*c217d954SCole Faust#endif
6961*c217d954SCole Faust#if M0 > 3
6962*c217d954SCole Faust        ARM_DOT_K0XN0(a3, b, c3);
6963*c217d954SCole Faust#endif
6964*c217d954SCole Faust#if M0 > 4
6965*c217d954SCole Faust        ARM_DOT_K0XN0(a4, b, c4);
6966*c217d954SCole Faust#endif
6967*c217d954SCole Faust#if M0 > 5
6968*c217d954SCole Faust        ARM_DOT_K0XN0(a5, b, c5);
6969*c217d954SCole Faust#endif
6970*c217d954SCole Faust#if M0 > 6
6971*c217d954SCole Faust        ARM_DOT_K0XN0(a6, b, c6);
6972*c217d954SCole Faust#endif
6973*c217d954SCole Faust#if M0 > 7
6974*c217d954SCole Faust        ARM_DOT_K0XN0(a7, b, c7);
6975*c217d954SCole Faust#endif
6976*c217d954SCole Faust
        // Advance to the next K0 slice: the LHS by bytes, the RHS by image x coordinate.
6977*c217d954SCole Faust        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
6978*c217d954SCole Faust
6979*c217d954SCole Faust        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
6980*c217d954SCole Faust    }
6981*c217d954SCole Faust
    // Byte address of the first element of this work-item's output tile (batch offset added below).
6982*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
6983*c217d954SCole Faust
    // Per-row z offsets for the store; stay 0 unless REINTERPRET_OUTPUT_AS_3D fills them in.
6984*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
6985*c217d954SCole Faust
    // True when the tile reaches or crosses the M (y) / N (x) boundary; both flags are passed to
    // the boundary-aware load/store macros together with PARTIAL_STORE_M0 / PARTIAL_STORE_N0.
6986*c217d954SCole Faust    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
6987*c217d954SCole Faust    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
6988*c217d954SCole Faust
6989*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
6990*c217d954SCole Faust
6991*c217d954SCole Faust
    // 3D output: zout<i> is computed by CALCULATE_Z_OFFSET and the batch stride is scaled by
    // DEPTH_GEMM3D.
6992*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
6993*c217d954SCole Faust
6994*c217d954SCole Faust
6995*c217d954SCole Faust    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
6996*c217d954SCole Faust
6997*c217d954SCole Faust#else
6998*c217d954SCole Faust
6999*c217d954SCole Faust
    // 2D output: plain per-batch offset.
7000*c217d954SCole Faust    dst_addr += get_global_id(2) * dst_stride_z;
7001*c217d954SCole Faust
7002*c217d954SCole Faust#endif
7003*c217d954SCole Faust
7004*c217d954SCole Faust
    // Optional scaling of the accumulators by ALPHA.
7005*c217d954SCole Faust#if defined(ALPHA)
7006*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
7007*c217d954SCole Faust#endif
7008*c217d954SCole Faust
7009*c217d954SCole Faust
    // Optional bias addition. The bias is scaled by BETA unless UNIT_BETA is defined.
7010*c217d954SCole Faust#if defined(BETA)
7011*c217d954SCole Faust#if defined(BROADCAST_BIAS)
    // Broadcast bias: a single row of N0 values at this tile's column offset is loaded and added
    // to the M0 accumulator rows through ADD_BLOCK_BROADCAST.
7012*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
7013*c217d954SCole Faust
7014*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
7015*c217d954SCole Faust
7016*c217d954SCole Faust#ifndef UNIT_BETA
7017*c217d954SCole Faust    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
7018*c217d954SCole Faust#endif
7019*c217d954SCole Faust
7020*c217d954SCole Faust
    // With MIXED_PRECISION the bias is converted to DATA_TYPE_ACCUMULATOR before the addition.
7021*c217d954SCole Faust#if defined(MIXED_PRECISION)
7022*c217d954SCole Faust    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7023*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
7024*c217d954SCole Faust#else
7025*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias0);
7026*c217d954SCole Faust#endif
7027*c217d954SCole Faust
7028*c217d954SCole Faust#else
    // Full bias: an M0 x N0 tile addressed like the output tile (including the batch offset) is
    // loaded with the same edge conditions and added row by row through ADD_BLOCK.
7029*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
7030*c217d954SCole Faust                                    2) * bias_stride_z;
7031*c217d954SCole Faust
7032*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7033*c217d954SCole Faust
7034*c217d954SCole Faust#ifndef UNIT_BETA
7035*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
7036*c217d954SCole Faust#endif
7037*c217d954SCole Faust
7038*c217d954SCole Faust
7039*c217d954SCole Faust#if defined(MIXED_PRECISION)
7040*c217d954SCole Faust    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7041*c217d954SCole Faust    ADD_BLOCK(M0, c, bias_hp);
7042*c217d954SCole Faust#else
7043*c217d954SCole Faust    ADD_BLOCK(M0, c, bias);
7044*c217d954SCole Faust#endif
7045*c217d954SCole Faust
7046*c217d954SCole Faust#endif
7047*c217d954SCole Faust#endif
7048*c217d954SCole Faust
    // Optional activation, applied in the accumulator's element type.
7049*c217d954SCole Faust#if defined(ACTIVATION_TYPE)
7050*c217d954SCole Faust#if defined(MIXED_PRECISION)
7051*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
7052*c217d954SCole Faust#else
7053*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
7054*c217d954SCole Faust#endif
7055*c217d954SCole Faust#endif
7056*c217d954SCole Faust
7057*c217d954SCole Faust
    // Store the tile. With MIXED_PRECISION the accumulators are first converted to DATA_TYPE
    // (c_lp) because dst holds DATA_TYPE elements.
7058*c217d954SCole Faust#if defined(MIXED_PRECISION)
7059*c217d954SCole Faust    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
7060*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7061*c217d954SCole Faust#else
7062*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7063*c217d954SCole Faust#endif
7064*c217d954SCole Faust
    // Remove the kernel-local helper macros so they do not leak into the rest of the file.
7065*c217d954SCole Faust#undef LHS_BLOCK_SIZE
7066*c217d954SCole Faust#undef LHS_OFFSET_X
7067*c217d954SCole Faust#undef LHS_STEP_X
7068*c217d954SCole Faust#undef RHS_BLOCK_SIZE
7069*c217d954SCole Faust#undef RHS_OFFSET_X
7070*c217d954SCole Faust#undef RHS_STEP_X
7071*c217d954SCole Faust#undef PIXEL_UNIT
7072*c217d954SCole Faust#undef LHS_STEP_LOOP
7073*c217d954SCole Faust#undef RHS_STEP_LOOP
7074*c217d954SCole Faust}
7075*c217d954SCole Faust#endif
7076*c217d954SCole Faust
7077*c217d954SCole Faust#if defined(LHS_TRANSPOSE)
7078*c217d954SCole Faust
// VTYPE(TYPE, SIZE): shorthand for the vector type VEC_DATA_TYPE(TYPE, SIZE).
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA_IN(N0, x): operand as seen by ARM_VFMA.
// With MIXED_PRECISION the operand is converted to an N0-wide DATA_TYPE_ACCUMULATOR vector;
// otherwise it is used unchanged.
#if defined(MIXED_PRECISION)
#define ARM_VFMA_IN(N0, x) (CONVERT(x, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)))
#else
#define ARM_VFMA_IN(N0, x) (x)
#endif

// ARM_VFMA(N0, a, b, c): c = a * b + c.
// When GPU_ARCH equals GPU_ARCH_MIDGARD it is written as `c += a * b`, otherwise as a call to
// fma(). The expansion already ends with ';'.
#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += ARM_VFMA_IN(N0, a) * ARM_VFMA_IN(N0, b);
#else
#define ARM_VFMA(N0, a, b, c) c = fma(ARM_VFMA_IN(N0, a), ARM_VFMA_IN(N0, b), (c));
#endif
7098*c217d954SCole Faust
// ARM_VVM_T_NT_<M0>xN0x1(N0, TYPE, a, b, C)
//
// One multiply-accumulate step for a tile of <M0> rows: for every row i, lane i of `a` is
// expanded to an N0-wide vector of TYPE and ARM_VFMA accumulates its product with the vector `b`
// into C<i>.
//   N0   - vector width of `b` and of every C<i>
//   TYPE - element type used for the expanded lane of `a`
//   a    - scalar for the 1-row variant, otherwise a vector whose lanes .s0 .. .s<M0-1> are read
//   b    - N0-wide vector
//   C    - base name of the accumulators; the row index is token-pasted onto it (C0, C1, ...)
// Variants exist for 1, 2, 3, 4 and 8 rows.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })

#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })

#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })

#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })

#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

// Dispatcher: selects the row-count variant by pasting M0 into the macro name.
// M0 is pasted as written, so it must already be a literal 1, 2, 3, 4 or 8 at this point.
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
7134*c217d954SCole Faust
// ARM_MM_T_NT_M0xN0x<K> for K = 1, 2, 3, 4, 8.
// A and B are base names: step k pastes the index to form A##k / B##k and hands
// that pair, together with the shared C, to ARM_VVM_T_NT_M0xN0x1.
// Each variant first expands the next smaller variant (x8 expands x4) and then
// adds its own remaining steps, so the x<K> variant issues steps 0..K-1 in order.
// Every body is a ({ ... }) statement expression.
7135*c217d954SCole Faust#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
7136*c217d954SCole Faust    ({                                                         \
7137*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
7138*c217d954SCole Faust    })
7139*c217d954SCole Faust#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
7140*c217d954SCole Faust    ({                                                         \
7141*c217d954SCole Faust        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
7142*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
7143*c217d954SCole Faust    })
7144*c217d954SCole Faust#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
7145*c217d954SCole Faust    ({                                                         \
7146*c217d954SCole Faust        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
7147*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
7148*c217d954SCole Faust    })
7149*c217d954SCole Faust#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
7150*c217d954SCole Faust    ({                                                         \
7151*c217d954SCole Faust        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
7152*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
7153*c217d954SCole Faust    })
7154*c217d954SCole Faust#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
7155*c217d954SCole Faust    ({                                                         \
7156*c217d954SCole Faust        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
7157*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
7158*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
7159*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
7160*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
7161*c217d954SCole Faust    })
// ARM_MM_T_NT_M0xN0x16: sixteen-step variant.
// A and B are base names. Steps 0..7 are delegated to ARM_MM_T_NT_M0xN0x8;
// steps 8..F paste the hexadecimal index (A##8 .. A##F, B##8 .. B##F) and call
// ARM_VVM_T_NT_M0xN0x1 directly with the shared C, the same way the x8 variant
// handles steps 4..7.
// These steps must not be routed through ARM_MM_T_NT_M0xN0x1: that macro pastes
// "0" onto its A/B arguments, and pasting onto an already parenthesised
// argument such as (a8) does not form a valid preprocessing token.
7162*c217d954SCole Faust#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C)            \
7163*c217d954SCole Faust    ({                                                         \
7164*c217d954SCole Faust        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C);            \
7165*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
7166*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
7167*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
7168*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
7169*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
7170*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
7171*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
7172*c217d954SCole Faust        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
7173*c217d954SCole Faust    })
7174*c217d954SCole Faust
7175*c217d954SCole Faust
7176*c217d954SCole Faust
7177*c217d954SCole Faust
7178*c217d954SCole Faust
7179*c217d954SCole Faust
7180*c217d954SCole Faust
7181*c217d954SCole Faust
7182*c217d954SCole Faust
// ARM_MM_T_NT: entry point of the ARM_MM_T_NT_M0xN0x<K0> family.
// CONCAT (defined outside this section) joins the prefix with K0 to pick the
// variant, which is then invoked with (M0, N0, TYPE, A, B, C).
// K0 must name an existing variant: 1, 2, 3, 4, 8 or 16.
// A and B are base names that the variant extends with a step index; C is
// forwarded untouched.
7183*c217d954SCole Faust#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
7184*c217d954SCole Faust    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
7185*c217d954SCole Faust    (M0, N0, TYPE, A, B, C)
7186*c217d954SCole Faust
7187*c217d954SCole Faust#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
7188*c217d954SCole Faust
// gemm_mm_reshaped_lhs_t_rhs_nt
//
// Matrix-multiply kernel (per its name: reshaped, transposed LHS and reshaped,
// non-transposed RHS). Each work-item accumulates one block of M0 accumulator
// rows with N0 lanes each, then optionally scales it (ALPHA), adds a bias (BETA),
// applies an activation (ACTIVATION_TYPE) and writes it to dst through the
// boundary-aware store macro.
//
// Build-time macros read by the body:
//   required: M0, N0, K0, V0, H0, DATA_TYPE, DATA_TYPE_ACCUMULATOR,
//             PARTIAL_STORE_M0, PARTIAL_STORE_N0
//   optional: LHS_INTERLEAVE, RHS_INTERLEAVE, DUMMY_WORK_ITEMS, MATRIX_B_DEPTH,
//             REINTERPRET_OUTPUT_AS_3D (uses HEIGHT_GEMM3D, DEPTH_GEMM3D),
//             ALPHA, BETA, BROADCAST_BIAS, UNIT_BETA, MIXED_PRECISION,
//             ACTIVATION_TYPE (uses A_VAL, B_VAL)
//
// Parameters:
//   lhs, rhs, bias, dst  Declared through IMAGE_DECLARATION (defined outside this
//                        section). The body uses <name>_ptr, <name>_stride_y and
//                        <name>_offset_first_element_in_bytes. bias is present
//                        only when BETA is defined.
//   lhs_stride_z, rhs_stride_z, bias_stride_z, dst_stride_z
//                        Byte strides multiplied by get_global_id(2) (bias_stride_z
//                        only when BETA is defined).
//   dst_cross_plane_pad  Forwarded to CALCULATE_Z_OFFSET; present only when
//                        REINTERPRET_OUTPUT_AS_3D is defined.
//   M, N                 Extents compared against the block row / column position
//                        for the early exit and the partial-block conditions.
//   K                    Upper bound of the accumulation loop, walked in steps of K0.
7189*c217d954SCole Faust__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
7190*c217d954SCole Faust                                            IMAGE_DECLARATION(rhs),
7191*c217d954SCole Faust#if defined(BETA)
7192*c217d954SCole Faust                                            IMAGE_DECLARATION(bias),
7193*c217d954SCole Faust#endif
7194*c217d954SCole Faust                                            IMAGE_DECLARATION(dst),
7195*c217d954SCole Faust                                            uint lhs_stride_z,
7196*c217d954SCole Faust                                            uint rhs_stride_z,
7197*c217d954SCole Faust#if defined(BETA)
7198*c217d954SCole Faust                                            uint bias_stride_z,
7199*c217d954SCole Faust#endif
7200*c217d954SCole Faust                                            uint dst_stride_z
7201*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
7202*c217d954SCole Faust                                            ,
7203*c217d954SCole Faust                                            uint dst_cross_plane_pad
7204*c217d954SCole Faust#endif
7205*c217d954SCole Faust                                            ,
7206*c217d954SCole Faust                                            const int M,
7207*c217d954SCole Faust                                            const int N,
7208*c217d954SCole Faust                                            const int K)
7209*c217d954SCole Faust{
7210*c217d954SCole Faust
    // Number of elements in one LHS block (K0 steps of M0 values).
7211*c217d954SCole Faust#define LHS_BLOCK_SIZE ((K0) * (M0))
7212*c217d954SCole Faust
    // LHS addressing, in elements.
    //   LHS_INTERLEAVE:  neighbouring blocks start M0 apart, one k-step advances M0 * V0.
    //   otherwise:       neighbouring blocks start LHS_BLOCK_SIZE apart, one k-step advances M0.
    // LHS_STEP_LOOP is defined here but not referenced by this kernel.
7213*c217d954SCole Faust#if defined(LHS_INTERLEAVE)
7214*c217d954SCole Faust#define LHS_OFFSET_X (M0)
7215*c217d954SCole Faust#define LHS_STEP_X ((M0) * (V0))
7216*c217d954SCole Faust#define LHS_STEP_LOOP (1)
7217*c217d954SCole Faust#else
7218*c217d954SCole Faust#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
7219*c217d954SCole Faust#define LHS_STEP_X (M0)
7220*c217d954SCole Faust#define LHS_STEP_LOOP (V0)
7221*c217d954SCole Faust#endif
7222*c217d954SCole Faust
7223*c217d954SCole Faust
    // Number of elements in one RHS block (K0 steps of N0 values).
7224*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (N0))
7225*c217d954SCole Faust
7226*c217d954SCole Faust
    // RHS addressing, in elements; mirrors the LHS scheme with N0 / H0.
7227*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
7228*c217d954SCole Faust#define RHS_OFFSET_X (N0)
7229*c217d954SCole Faust#define RHS_STEP_X ((N0) * (H0))
7230*c217d954SCole Faust#else
7231*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
7232*c217d954SCole Faust#define RHS_STEP_X (N0)
7233*c217d954SCole Faust#endif
7234*c217d954SCole Faust
    // Work-item coordinates: x selects the block column, y the block row, z the batch.
7235*c217d954SCole Faust    const uint x = get_global_id(0);
7236*c217d954SCole Faust    const uint y = get_global_id(1);
7237*c217d954SCole Faust    const uint z = get_global_id(2);
7238*c217d954SCole Faust
    // True when this block ends at or beyond row M (cond_y) / column N (cond_x).
    // Both flags are handed to the boundary-aware load/store macros below.
7239*c217d954SCole Faust    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
7240*c217d954SCole Faust    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
7241*c217d954SCole Faust
    // Work-items whose block starts at or beyond column N or row M have nothing to do.
7242*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
7243*c217d954SCole Faust    if((x * N0 >= N) || (y * M0 >= M))
7244*c217d954SCole Faust    {
7245*c217d954SCole Faust        return;
7246*c217d954SCole Faust    }
7247*c217d954SCole Faust#endif
7248*c217d954SCole Faust
7249*c217d954SCole Faust
    // First LHS byte for this work-item: block (y % V0) within row (y / V0), batch z.
7250*c217d954SCole Faust    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
7251*c217d954SCole Faust
7252*c217d954SCole Faust
    // First RHS byte for this work-item: block (x % H0) within row (x / H0).
7253*c217d954SCole Faust    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
7254*c217d954SCole Faust
    // RHS batch offset; with MATRIX_B_DEPTH the batch index wraps modulo that depth.
7255*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
7256*c217d954SCole Faust
7257*c217d954SCole Faust    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
7258*c217d954SCole Faust#else
7259*c217d954SCole Faust    rhs_addr += z * rhs_stride_z;
7260*c217d954SCole Faust#endif
7261*c217d954SCole Faust
7262*c217d954SCole Faust
    // Accumulators of type VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), base name c, initial value 0.
    // REPEAT_VAR_INIT_TO_CONST is defined outside this section; presumably it declares
    // c0..c<M0-1> - confirm against its definition.
7263*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
7264*c217d954SCole Faust
    // uint values with base name zero, set to 0; passed to LOAD_BLOCK_BOUNDARY_AWARE for the bias.
7265*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
7266*c217d954SCole Faust
    // Typed views of the two byte addresses; advanced in elements inside the loop.
7267*c217d954SCole Faust    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
7268*c217d954SCole Faust    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
7269*c217d954SCole Faust
    // Accumulation loop. One iteration covers K0 k-steps, unrolled by the preprocessor.
    // A k-step loads M0 LHS values into a0 and N0 RHS values into b0, passes the base
    // names a, b, c to ARM_MM_T_NT with K0 = 1 (which consumes a0 and b0), and then
    // advances both pointers by their per-step stride.
    // NOTE(review): the "#if K0 > 4" and "#if K0 > 8" sections add 4 and 8 steps, so the
    // unrolling matches K0 only for 1, 2, 3, 4, 8 and 16 - confirm K0 is restricted to these.
7270*c217d954SCole Faust    for(int i = 0; i < K; i += K0)
7271*c217d954SCole Faust    {
7272*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, M0)
7273*c217d954SCole Faust        a0;
7274*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, N0)
7275*c217d954SCole Faust        b0;
7276*c217d954SCole Faust
7277*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7278*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7279*c217d954SCole Faust
7280*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7281*c217d954SCole Faust
7282*c217d954SCole Faust        lhs += LHS_STEP_X;
7283*c217d954SCole Faust        rhs += RHS_STEP_X;
7284*c217d954SCole Faust
7285*c217d954SCole Faust#if K0 > 1
7286*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7287*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7288*c217d954SCole Faust
7289*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7290*c217d954SCole Faust
7291*c217d954SCole Faust        lhs += LHS_STEP_X;
7292*c217d954SCole Faust        rhs += RHS_STEP_X;
7293*c217d954SCole Faust#endif
7294*c217d954SCole Faust
7295*c217d954SCole Faust#if K0 > 2
7296*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7297*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7298*c217d954SCole Faust
7299*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7300*c217d954SCole Faust
7301*c217d954SCole Faust        lhs += LHS_STEP_X;
7302*c217d954SCole Faust        rhs += RHS_STEP_X;
7303*c217d954SCole Faust#endif
7304*c217d954SCole Faust
7305*c217d954SCole Faust#if K0 > 3
7306*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7307*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7308*c217d954SCole Faust
7309*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7310*c217d954SCole Faust
7311*c217d954SCole Faust        lhs += LHS_STEP_X;
7312*c217d954SCole Faust        rhs += RHS_STEP_X;
7313*c217d954SCole Faust#endif
7314*c217d954SCole Faust
        // Four more k-steps (steps 5 to 8).
7315*c217d954SCole Faust#if K0 > 4
7316*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7317*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7318*c217d954SCole Faust
7319*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7320*c217d954SCole Faust
7321*c217d954SCole Faust        lhs += LHS_STEP_X;
7322*c217d954SCole Faust        rhs += RHS_STEP_X;
7323*c217d954SCole Faust
7324*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7325*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7326*c217d954SCole Faust
7327*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7328*c217d954SCole Faust
7329*c217d954SCole Faust        lhs += LHS_STEP_X;
7330*c217d954SCole Faust        rhs += RHS_STEP_X;
7331*c217d954SCole Faust
7332*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7333*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7334*c217d954SCole Faust
7335*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7336*c217d954SCole Faust
7337*c217d954SCole Faust        lhs += LHS_STEP_X;
7338*c217d954SCole Faust        rhs += RHS_STEP_X;
7339*c217d954SCole Faust
7340*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7341*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7342*c217d954SCole Faust
7343*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7344*c217d954SCole Faust
7345*c217d954SCole Faust        lhs += LHS_STEP_X;
7346*c217d954SCole Faust        rhs += RHS_STEP_X;
7347*c217d954SCole Faust#endif
7348*c217d954SCole Faust
        // Eight more k-steps (steps 9 to 16).
7349*c217d954SCole Faust#if K0 > 8
7350*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7351*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7352*c217d954SCole Faust
7353*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7354*c217d954SCole Faust
7355*c217d954SCole Faust        lhs += LHS_STEP_X;
7356*c217d954SCole Faust        rhs += RHS_STEP_X;
7357*c217d954SCole Faust
7358*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7359*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7360*c217d954SCole Faust
7361*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7362*c217d954SCole Faust
7363*c217d954SCole Faust        lhs += LHS_STEP_X;
7364*c217d954SCole Faust        rhs += RHS_STEP_X;
7365*c217d954SCole Faust
7366*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7367*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7368*c217d954SCole Faust
7369*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7370*c217d954SCole Faust
7371*c217d954SCole Faust        lhs += LHS_STEP_X;
7372*c217d954SCole Faust        rhs += RHS_STEP_X;
7373*c217d954SCole Faust
7374*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7375*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7376*c217d954SCole Faust
7377*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7378*c217d954SCole Faust
7379*c217d954SCole Faust        lhs += LHS_STEP_X;
7380*c217d954SCole Faust        rhs += RHS_STEP_X;
7381*c217d954SCole Faust
7382*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7383*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7384*c217d954SCole Faust
7385*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7386*c217d954SCole Faust
7387*c217d954SCole Faust        lhs += LHS_STEP_X;
7388*c217d954SCole Faust        rhs += RHS_STEP_X;
7389*c217d954SCole Faust
7390*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7391*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7392*c217d954SCole Faust
7393*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7394*c217d954SCole Faust
7395*c217d954SCole Faust        lhs += LHS_STEP_X;
7396*c217d954SCole Faust        rhs += RHS_STEP_X;
7397*c217d954SCole Faust
7398*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7399*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7400*c217d954SCole Faust
7401*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7402*c217d954SCole Faust
7403*c217d954SCole Faust        lhs += LHS_STEP_X;
7404*c217d954SCole Faust        rhs += RHS_STEP_X;
7405*c217d954SCole Faust
7406*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7407*c217d954SCole Faust        b0 = VLOAD(N0)(0, rhs);
7408*c217d954SCole Faust
7409*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7410*c217d954SCole Faust
7411*c217d954SCole Faust        lhs += LHS_STEP_X;
7412*c217d954SCole Faust        rhs += RHS_STEP_X;
7413*c217d954SCole Faust#endif
7414*c217d954SCole Faust
        // Non-interleaved layouts: the K0 steps above moved the pointer across one whole
        // block, so skip the remaining V0 - 1 (resp. H0 - 1) blocks to reach this
        // work-item's next block.
7415*c217d954SCole Faust#ifndef LHS_INTERLEAVE
7416*c217d954SCole Faust        lhs += (M0 * K0 * (V0 - 1));
7417*c217d954SCole Faust#endif
7418*c217d954SCole Faust
7419*c217d954SCole Faust#ifndef RHS_INTERLEAVE
7420*c217d954SCole Faust        rhs += (N0 * K0 * (H0 - 1));
7421*c217d954SCole Faust#endif
7422*c217d954SCole Faust    }
7423*c217d954SCole Faust
    // Destination byte address of the block: column offset x * N0 elements, row y * M0.
7424*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
7425*c217d954SCole Faust
    // uint values with base name zout, set to 0; passed to the store macro and, with
    // REINTERPRET_OUTPUT_AS_3D, first handed to CALCULATE_Z_OFFSET.
7426*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
7427*c217d954SCole Faust
7428*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
7429*c217d954SCole Faust
7430*c217d954SCole Faust
    // CALCULATE_Z_OFFSET is defined outside this section. It receives zout, the first
    // row of the block (y * M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, the cross-plane padding
    // and the dst row stride.
7431*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
7432*c217d954SCole Faust
7433*c217d954SCole Faust
    // Batch offset: each z step spans DEPTH_GEMM3D times dst_stride_z bytes.
7434*c217d954SCole Faust    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
7435*c217d954SCole Faust
7436*c217d954SCole Faust#else
7437*c217d954SCole Faust
7438*c217d954SCole Faust
    // Batch offset: each z step spans dst_stride_z bytes.
7439*c217d954SCole Faust    dst_addr += z * dst_stride_z;
7440*c217d954SCole Faust
7441*c217d954SCole Faust#endif
7442*c217d954SCole Faust
7443*c217d954SCole Faust
    // Optional scaling of the accumulated block by ALPHA.
7444*c217d954SCole Faust#if defined(ALPHA)
7445*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
7446*c217d954SCole Faust#endif
7447*c217d954SCole Faust
7448*c217d954SCole Faust
    // Optional bias. The bias is scaled by BETA unless UNIT_BETA is defined and, with
    // MIXED_PRECISION, converted to DATA_TYPE_ACCUMULATOR before it is added to c.
7449*c217d954SCole Faust#if defined(BETA)
7450*c217d954SCole Faust#if defined(BROADCAST_BIAS)
    // Broadcast variant: a single bias row of N0 values at column offset x * N0 is
    // loaded and passed to ADD_BLOCK_BROADCAST for all M0 accumulator rows.
7451*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
7452*c217d954SCole Faust
7453*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
7454*c217d954SCole Faust
7455*c217d954SCole Faust#ifndef UNIT_BETA
7456*c217d954SCole Faust    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
7457*c217d954SCole Faust#endif
7458*c217d954SCole Faust
7459*c217d954SCole Faust
7460*c217d954SCole Faust#if defined(MIXED_PRECISION)
7461*c217d954SCole Faust    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7462*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
7463*c217d954SCole Faust#else
7464*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias0);
7465*c217d954SCole Faust#endif
7466*c217d954SCole Faust
7467*c217d954SCole Faust#else
    // Full variant: an M0 x N0 bias block is loaded from the same block position
    // (column, row, batch) as the destination and added row by row.
7468*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
7469*c217d954SCole Faust                                    2) * bias_stride_z;
7470*c217d954SCole Faust
7471*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7472*c217d954SCole Faust
7473*c217d954SCole Faust#ifndef UNIT_BETA
7474*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
7475*c217d954SCole Faust#endif
7476*c217d954SCole Faust
7477*c217d954SCole Faust#if defined(MIXED_PRECISION)
7478*c217d954SCole Faust    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7479*c217d954SCole Faust    ADD_BLOCK(M0, c, bias_hp);
7480*c217d954SCole Faust#else
7481*c217d954SCole Faust    ADD_BLOCK(M0, c, bias);
7482*c217d954SCole Faust#endif
7483*c217d954SCole Faust
7484*c217d954SCole Faust#endif
7485*c217d954SCole Faust#endif
7486*c217d954SCole Faust
    // Optional activation, applied in the accumulator type when MIXED_PRECISION is set.
7487*c217d954SCole Faust#if defined(ACTIVATION_TYPE)
7488*c217d954SCole Faust#if defined(MIXED_PRECISION)
7489*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
7490*c217d954SCole Faust#else
7491*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
7492*c217d954SCole Faust#endif
7493*c217d954SCole Faust#endif
7494*c217d954SCole Faust
7495*c217d954SCole Faust
    // Store the block. With MIXED_PRECISION the accumulators are first converted to
    // DATA_TYPE (base name c_lp). cond_y / cond_x and PARTIAL_STORE_M0 / PARTIAL_STORE_N0
    // are passed to the boundary-aware store macro.
7496*c217d954SCole Faust#if defined(MIXED_PRECISION)
7497*c217d954SCole Faust    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
7498*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7499*c217d954SCole Faust#else
7500*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7501*c217d954SCole Faust#endif
7502*c217d954SCole Faust
    // Drop every helper macro defined at the top of this kernel, LHS_STEP_LOOP included,
    // so that none of them leaks into the code that follows.
7503*c217d954SCole Faust#undef LHS_BLOCK_SIZE
7504*c217d954SCole Faust#undef LHS_OFFSET_X
7505*c217d954SCole Faust#undef LHS_STEP_X
#undef LHS_STEP_LOOP
7506*c217d954SCole Faust#undef RHS_BLOCK_SIZE
7507*c217d954SCole Faust#undef RHS_OFFSET_X
7508*c217d954SCole Faust#undef RHS_STEP_X
7509*c217d954SCole Faust}
7510*c217d954SCole Faust#endif
7511*c217d954SCole Faust
7512*c217d954SCole Faust#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
7513*c217d954SCole Faust
7514*c217d954SCole Faust__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
7515*c217d954SCole Faust                                                    __read_only image2d_t rhs_img,
7516*c217d954SCole Faust#if defined(BETA)
7517*c217d954SCole Faust                                                    IMAGE_DECLARATION(bias),
7518*c217d954SCole Faust#endif
7519*c217d954SCole Faust                                                    IMAGE_DECLARATION(dst),
7520*c217d954SCole Faust                                                    uint lhs_stride_z,
7521*c217d954SCole Faust                                                    uint rhs_stride_z,
7522*c217d954SCole Faust#if defined(BETA)
7523*c217d954SCole Faust                                                    uint bias_stride_z,
7524*c217d954SCole Faust#endif
7525*c217d954SCole Faust                                                    uint dst_stride_z
7526*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
7527*c217d954SCole Faust                                                    ,
7528*c217d954SCole Faust                                                    uint dst_cross_plane_pad
7529*c217d954SCole Faust#endif
7530*c217d954SCole Faust                                                    ,
7531*c217d954SCole Faust                                                    const int M,
7532*c217d954SCole Faust                                                    const int N,
7533*c217d954SCole Faust                                                    const int K)
7534*c217d954SCole Faust{
7535*c217d954SCole Faust
7536*c217d954SCole Faust#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
7537*c217d954SCole Faust
7538*c217d954SCole Faust
7539*c217d954SCole Faust#define LHS_BLOCK_SIZE ((K0) * (M0))
7540*c217d954SCole Faust
7541*c217d954SCole Faust#if defined(LHS_INTERLEAVE)
7542*c217d954SCole Faust#define LHS_OFFSET_X (M0)
7543*c217d954SCole Faust#define LHS_STEP_X ((M0) * (V0))
7544*c217d954SCole Faust#define LHS_STEP_LOOP (1)
7545*c217d954SCole Faust#else
7546*c217d954SCole Faust#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
7547*c217d954SCole Faust#define LHS_STEP_X (M0)
7548*c217d954SCole Faust#define LHS_STEP_LOOP (V0)
7549*c217d954SCole Faust#endif
7550*c217d954SCole Faust
7551*c217d954SCole Faust
7552*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
7553*c217d954SCole Faust
7554*c217d954SCole Faust
7555*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
7556*c217d954SCole Faust#define RHS_OFFSET_X (PIXEL_UNIT)
7557*c217d954SCole Faust#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
7558*c217d954SCole Faust#else
7559*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
7560*c217d954SCole Faust#define RHS_STEP_X (PIXEL_UNIT)
7561*c217d954SCole Faust#endif
7562*c217d954SCole Faust
7563*c217d954SCole Faust    const uint x = get_global_id(0);
7564*c217d954SCole Faust    const uint y = get_global_id(1);
7565*c217d954SCole Faust    const uint z = get_global_id(2);
7566*c217d954SCole Faust
7567*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
7568*c217d954SCole Faust    if((x * N0 >= N) || (y * M0 >= M))
7569*c217d954SCole Faust    {
7570*c217d954SCole Faust        return;
7571*c217d954SCole Faust    }
7572*c217d954SCole Faust#endif
7573*c217d954SCole Faust
7574*c217d954SCole Faust
7575*c217d954SCole Faust    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
7576*c217d954SCole Faust
7577*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
7578*c217d954SCole Faust
7579*c217d954SCole Faust    const uint z_rhs = (z % MATRIX_B_DEPTH);
7580*c217d954SCole Faust#else
7581*c217d954SCole Faust    const uint z_rhs = z;
7582*c217d954SCole Faust#endif
7583*c217d954SCole Faust
7584*c217d954SCole Faust
7585*c217d954SCole Faust    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
7586*c217d954SCole Faust    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
7587*c217d954SCole Faust
7588*c217d954SCole Faust
7589*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
7590*c217d954SCole Faust
7591*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
7592*c217d954SCole Faust
7593*c217d954SCole Faust    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
7594*c217d954SCole Faust
7595*c217d954SCole Faust    for(int i = 0; i < K; i += K0)
7596*c217d954SCole Faust    {
7597*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, M0)
7598*c217d954SCole Faust        a0;
7599*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, N0)
7600*c217d954SCole Faust        b0;
7601*c217d954SCole Faust
7602*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7603*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
7604*c217d954SCole Faust
7605*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7606*c217d954SCole Faust
7607*c217d954SCole Faust        lhs += LHS_STEP_X;
7608*c217d954SCole Faust
7609*c217d954SCole Faust#if K0 > 1
7610*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7611*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
7612*c217d954SCole Faust
7613*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7614*c217d954SCole Faust
7615*c217d954SCole Faust        lhs += LHS_STEP_X;
7616*c217d954SCole Faust#endif
7617*c217d954SCole Faust
7618*c217d954SCole Faust#if K0 > 2
7619*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7620*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
7621*c217d954SCole Faust
7622*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7623*c217d954SCole Faust
7624*c217d954SCole Faust        lhs += LHS_STEP_X;
7625*c217d954SCole Faust#endif
7626*c217d954SCole Faust
7627*c217d954SCole Faust#if K0 > 3
7628*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7629*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
7630*c217d954SCole Faust
7631*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7632*c217d954SCole Faust
7633*c217d954SCole Faust        lhs += LHS_STEP_X;
7634*c217d954SCole Faust#endif
7635*c217d954SCole Faust
7636*c217d954SCole Faust#if K0 > 4
7637*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7638*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
7639*c217d954SCole Faust
7640*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7641*c217d954SCole Faust
7642*c217d954SCole Faust        lhs += LHS_STEP_X;
7643*c217d954SCole Faust
7644*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7645*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
7646*c217d954SCole Faust
7647*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7648*c217d954SCole Faust
7649*c217d954SCole Faust        lhs += LHS_STEP_X;
7650*c217d954SCole Faust
7651*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7652*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
7653*c217d954SCole Faust
7654*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7655*c217d954SCole Faust
7656*c217d954SCole Faust        lhs += LHS_STEP_X;
7657*c217d954SCole Faust
7658*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7659*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
7660*c217d954SCole Faust
7661*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7662*c217d954SCole Faust
7663*c217d954SCole Faust        lhs += LHS_STEP_X;
7664*c217d954SCole Faust#endif
7665*c217d954SCole Faust
7666*c217d954SCole Faust#if K0 > 8
7667*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7668*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
7669*c217d954SCole Faust
7670*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7671*c217d954SCole Faust
7672*c217d954SCole Faust        lhs += LHS_STEP_X;
7673*c217d954SCole Faust
7674*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7675*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
7676*c217d954SCole Faust
7677*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7678*c217d954SCole Faust
7679*c217d954SCole Faust        lhs += LHS_STEP_X;
7680*c217d954SCole Faust
7681*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7682*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
7683*c217d954SCole Faust
7684*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7685*c217d954SCole Faust
7686*c217d954SCole Faust        lhs += LHS_STEP_X;
7687*c217d954SCole Faust
7688*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7689*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
7690*c217d954SCole Faust
7691*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7692*c217d954SCole Faust
7693*c217d954SCole Faust        lhs += LHS_STEP_X;
7694*c217d954SCole Faust
7695*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7696*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
7697*c217d954SCole Faust
7698*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7699*c217d954SCole Faust
7700*c217d954SCole Faust        lhs += LHS_STEP_X;
7701*c217d954SCole Faust
7702*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7703*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
7704*c217d954SCole Faust
7705*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7706*c217d954SCole Faust
7707*c217d954SCole Faust        lhs += LHS_STEP_X;
7708*c217d954SCole Faust
7709*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7710*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
7711*c217d954SCole Faust
7712*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7713*c217d954SCole Faust
7714*c217d954SCole Faust        lhs += LHS_STEP_X;
7715*c217d954SCole Faust
7716*c217d954SCole Faust        a0 = VLOAD(M0)(0, lhs);
7717*c217d954SCole Faust        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
7718*c217d954SCole Faust
7719*c217d954SCole Faust        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7720*c217d954SCole Faust
7721*c217d954SCole Faust        lhs += LHS_STEP_X;
7722*c217d954SCole Faust#endif
7723*c217d954SCole Faust
7724*c217d954SCole Faust#ifndef LHS_INTERLEAVE
7725*c217d954SCole Faust        lhs += (M0 * K0 * (V0 - 1));
7726*c217d954SCole Faust#endif
7727*c217d954SCole Faust
7728*c217d954SCole Faust        x_rhs += K0 * RHS_STEP_X;
7729*c217d954SCole Faust#ifndef RHS_INTERLEAVE
7730*c217d954SCole Faust        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
7731*c217d954SCole Faust#endif
7732*c217d954SCole Faust    }
7733*c217d954SCole Faust
7734*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
7735*c217d954SCole Faust
7736*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
7737*c217d954SCole Faust
7738*c217d954SCole Faust    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
7739*c217d954SCole Faust    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
7740*c217d954SCole Faust
7741*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
7742*c217d954SCole Faust
7743*c217d954SCole Faust
7744*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
7745*c217d954SCole Faust
7746*c217d954SCole Faust
7747*c217d954SCole Faust    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
7748*c217d954SCole Faust
7749*c217d954SCole Faust#else
7750*c217d954SCole Faust
7751*c217d954SCole Faust
7752*c217d954SCole Faust    dst_addr += z * dst_stride_z;
7753*c217d954SCole Faust
7754*c217d954SCole Faust#endif
7755*c217d954SCole Faust
7756*c217d954SCole Faust
7757*c217d954SCole Faust#if defined(ALPHA)
7758*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
7759*c217d954SCole Faust#endif
7760*c217d954SCole Faust
7761*c217d954SCole Faust
7762*c217d954SCole Faust#if defined(BETA)
7763*c217d954SCole Faust#if defined(BROADCAST_BIAS)
7764*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
7765*c217d954SCole Faust
7766*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
7767*c217d954SCole Faust
7768*c217d954SCole Faust#ifndef UNIT_BETA
7769*c217d954SCole Faust    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
7770*c217d954SCole Faust#endif
7771*c217d954SCole Faust
7772*c217d954SCole Faust
7773*c217d954SCole Faust#if defined(MIXED_PRECISION)
7774*c217d954SCole Faust    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7775*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
7776*c217d954SCole Faust#else
7777*c217d954SCole Faust    ADD_BLOCK_BROADCAST(M0, c, bias0);
7778*c217d954SCole Faust#endif
7779*c217d954SCole Faust
7780*c217d954SCole Faust#else
7781*c217d954SCole Faust    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
7782*c217d954SCole Faust
7783*c217d954SCole Faust    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7784*c217d954SCole Faust
7785*c217d954SCole Faust#ifndef UNIT_BETA
7786*c217d954SCole Faust    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
7787*c217d954SCole Faust#endif
7788*c217d954SCole Faust
7789*c217d954SCole Faust#if defined(MIXED_PRECISION)
7790*c217d954SCole Faust    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7791*c217d954SCole Faust    ADD_BLOCK(M0, c, bias_hp);
7792*c217d954SCole Faust#else
7793*c217d954SCole Faust    ADD_BLOCK(M0, c, bias);
7794*c217d954SCole Faust#endif
7795*c217d954SCole Faust
7796*c217d954SCole Faust#endif
7797*c217d954SCole Faust#endif
7798*c217d954SCole Faust
7799*c217d954SCole Faust#if defined(ACTIVATION_TYPE)
7800*c217d954SCole Faust#if defined(MIXED_PRECISION)
7801*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
7802*c217d954SCole Faust#else
7803*c217d954SCole Faust    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
7804*c217d954SCole Faust#endif
7805*c217d954SCole Faust#endif
7806*c217d954SCole Faust
7807*c217d954SCole Faust
7808*c217d954SCole Faust#if defined(MIXED_PRECISION)
7809*c217d954SCole Faust    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
7810*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7811*c217d954SCole Faust#else
7812*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7813*c217d954SCole Faust#endif
7814*c217d954SCole Faust
7815*c217d954SCole Faust#undef LHS_BLOCK_SIZE
7816*c217d954SCole Faust#undef LHS_OFFSET_X
7817*c217d954SCole Faust#undef LHS_STEP_X
7818*c217d954SCole Faust#undef RHS_BLOCK_SIZE
7819*c217d954SCole Faust#undef RHS_OFFSET_X
7820*c217d954SCole Faust#undef RHS_STEP_X
7821*c217d954SCole Faust#undef PIXEL_UNIT
7822*c217d954SCole Faust#undef LHS_STEP_LOOP
7823*c217d954SCole Faust#undef RHS_STEP_LOOP
7824*c217d954SCole Faust}
7825*c217d954SCole Faust#endif
7826*c217d954SCole Faust
7827*c217d954SCole Faust#endif
7828*c217d954SCole Faust
7829*c217d954SCole Faust#endif
7830*c217d954SCole Faust
7831*c217d954SCole Faust#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
7832*c217d954SCole Faust
/* VFMA(a, b, c): in-place fused multiply-add, c = fma(a, b, c).
 * The body is wrapped in a parenthesised compound statement so the macro can
 * be used like a single statement.
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/* RHS_VFMA_M0xN0(i, a, b, c): one accumulation step over all M0 rows.
 *
 * For every row r in [0, M0) this expands to
 *     c<r> = fma((N0-wide vector)(a<r>.s<i>), b, c<r>)
 * i.e. component i of the lhs row vector a<r> is cast to the N0-wide vector
 * type and multiplied with the rhs row b, accumulating into c<r>.
 *
 *   i : vector component suffix pasted after .s (0-9, A-F at the call sites)
 *   a : basename of the lhs row variables (a0, a1, ...)
 *   b : rhs row vector expression, used as-is
 *   c : basename of the accumulator variables (c0, c1, ...)
 *
 * Assumes VEC_DATA_TYPE(DATA_TYPE, N0) names the N0-wide vector type; that
 * macro is defined outside this block. Only M0 in 1..8 is provided; any other
 * value stops compilation with the #error below.
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else
#error "M0 not supported"
#endif
7909*c217d954SCole Faust
7910*c217d954SCole Faust#if defined(GEMM_MM_NATIVE)
7911*c217d954SCole Faust
/** Matrix multiplication on non-reshaped operands: each work-item produces one
 *  block of M0 rows by N0 columns of dst.
 *
 * Flow of the kernel body:
 *   1. Compute byte offsets into lhs and rhs from the global ids (x, y, z).
 *   2. Accumulate along K: K0 elements per iteration while at least K0 remain,
 *      then one element per iteration for the remainder.
 *   3. Optionally scale the accumulators by ALPHA.
 *   4. Optionally add a bias block (scaled by BETA unless UNIT_BETA is defined).
 *   5. Optionally apply ACTIVATION_BLOCK.
 *   6. Store through STORE_BLOCK_BOUNDARY_AWARE.
 *
 * Compile-time symbols read here: M0, N0, K0, DATA_TYPE, PARTIAL_STORE_M0,
 * PARTIAL_STORE_N0 and, optionally, DUMMY_WORK_ITEMS, MATRIX_B_DEPTH,
 * REINTERPRET_INPUT_AS_3D, REINTERPRET_OUTPUT_AS_3D (both together with
 * HEIGHT_GEMM3D and DEPTH_GEMM3D), ALPHA, BETA, UNIT_BETA, BROADCAST_BIAS,
 * ACTIVATION_TYPE (together with A_VAL and B_VAL).
 *
 * @param lhs                  Left-hand matrix. IMAGE_DECLARATION is defined elsewhere; this
 *                             kernel uses lhs_ptr, lhs_stride_y and lhs_offset_first_element_in_bytes.
 * @param rhs                  Right-hand matrix (rhs_ptr, rhs_stride_y, rhs_offset_first_element_in_bytes).
 * @param bias                 Bias matrix, only present when BETA is defined.
 * @param dst                  Destination matrix (dst_ptr, dst_stride_y, dst_offset_first_element_in_bytes).
 * @param lhs_stride_z         Byte stride applied per z index to lhs.
 * @param rhs_stride_z         Byte stride applied per z index to rhs.
 * @param bias_stride_z        Byte stride applied per z index to bias, only present when BETA is defined.
 * @param dst_stride_z         Byte stride applied per z index to dst.
 * @param M                    Row bound used by the DUMMY_WORK_ITEMS early exit.
 * @param N                    Column bound used by the DUMMY_WORK_ITEMS early exit and by cond_x.
 * @param K                    Length of the accumulation dimension.
 * @param lhs_cross_plane_pad  Forwarded to CALCULATE_Z_OFFSET; only with REINTERPRET_INPUT_AS_3D.
 * @param dst_cross_plane_pad  Forwarded to CALCULATE_Z_OFFSET; only with REINTERPRET_OUTPUT_AS_3D.
 */
__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
                             IMAGE_DECLARATION(rhs),
#if defined(BETA)
                             IMAGE_DECLARATION(bias),
#endif
                             IMAGE_DECLARATION(dst),
                             uint lhs_stride_z,
                             uint rhs_stride_z,
#if defined(BETA)
                             uint bias_stride_z,
#endif
                             uint      dst_stride_z,
                             const int M,
                             const int N,
                             const int K
#if defined(REINTERPRET_INPUT_AS_3D)
                             ,
                             uint lhs_cross_plane_pad
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
                             ,
                             uint dst_cross_plane_pad
#endif
                            )
{
    // NOTE(review): RHS_BLOCK_SIZE and RHS_OFFSET_X are not referenced by name
    // anywhere else in this kernel body and are not undefined at its end.
    // Kept unchanged in case other code relies on them - confirm before removing.
#define RHS_BLOCK_SIZE ((K0) * (N0))

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block would start at or past the N / M bounds do nothing.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Byte offset of the first lhs row handled by this work-item.
    // COMPUTE_M0_START_ROW is defined elsewhere; presumably it maps y to the first
    // row of the block while accounting for a partial block - TODO confirm.
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Byte offset of the first rhs column handled by this work-item (x * N0 elements in).
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // z is wrapped modulo MATRIX_B_DEPTH before selecting the rhs slice.
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_offset += z * rhs_stride_z;
#endif

    // Extra per-row byte offsets passed to the block load macros, initialised to 0.
    // zlhs<r> is also read directly in the leftover loop below.
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // lhs is treated as a 3D tensor: fill zlhs<r> via CALCULATE_Z_OFFSET (defined elsewhere)
    // and step DEPTH_GEMM3D slices per z.
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else

    lhs_offset += z * lhs_stride_z;

#endif

    // Accumulators c<r>, one N0-wide vector per row, initialised to 0.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
#if K0 > 1
    // Main loop: consume K0 elements of the accumulation dimension per iteration.
    for(; i <= (K - K0); i += K0)
    {
        // lhs block: M0 rows of K0 elements, rows named a0, a1, ...
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // rhs block: K0 rows of N0 elements, rows named b0, b1, ...
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

        // One RHS_VFMA_M0xN0 per element along K0: component k of every lhs row times rhs row k.
        // NOTE(review): the K0 > 4 group touches components 4..7 and the K0 > 8 group 8..F,
        // so K0 is presumably restricted to 2, 3, 4, 8 or 16 - TODO confirm.
        RHS_VFMA_M0xN0(0, a, b0, c);
        RHS_VFMA_M0xN0(1, a, b1, c);
#if K0 > 2
        RHS_VFMA_M0xN0(2, a, b2, c);
#endif
#if K0 > 3
        RHS_VFMA_M0xN0(3, a, b3, c);
#endif
#if K0 > 4
        RHS_VFMA_M0xN0(4, a, b4, c);
        RHS_VFMA_M0xN0(5, a, b5, c);
        RHS_VFMA_M0xN0(6, a, b6, c);
        RHS_VFMA_M0xN0(7, a, b7, c);
#endif
#if K0 > 8
        RHS_VFMA_M0xN0(8, a, b8, c);
        RHS_VFMA_M0xN0(9, a, b9, c);
        RHS_VFMA_M0xN0(A, a, bA, c);
        RHS_VFMA_M0xN0(B, a, bB, c);
        RHS_VFMA_M0xN0(C, a, bC, c);
        RHS_VFMA_M0xN0(D, a, bD, c);
        RHS_VFMA_M0xN0(E, a, bE, c);
        RHS_VFMA_M0xN0(F, a, bF, c);
#endif

        // Advance K0 elements along the lhs row and K0 rows down rhs.
        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * rhs_stride_y;
    }
#endif

    // Leftover loop: one element of the accumulation dimension per iteration.
    for(; i < K; ++i)
    {
        // A single lhs scalar per row is placed in a 2-wide vector so that
        // RHS_VFMA_M0xN0 can read it through the .s0 component.
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
#endif
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
#endif
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
#endif
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
#endif
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
#endif
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
#endif
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
#endif

        // Single rhs row of N0 elements.
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
        RHS_VFMA_M0xN0(0, a, b, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += rhs_stride_y;
    }

    // Address of the first dst element written by this work-item.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    // Extra per-row byte offsets for the store, initialised to 0.
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // dst is treated as a 3D tensor: same scheme as for lhs above.
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else

    dst_addr += z * dst_stride_z;

#endif

#if defined(ALPHA)
    // Scale the accumulated block by ALPHA.
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif

#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // Broadcast bias: a single row of N0 values (bias0) is added to every accumulator row.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else
    // Full bias: an M0 x N0 block addressed like dst (same x / row / z terms, bias strides).
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK(M0, c, bias);

#endif
#endif

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif

    // Flags handed to the boundary-aware store: true for the first block row (y == 0)
    // and for the block column that reaches or passes N. Presumably they select the
    // PARTIAL_STORE_M0 / PARTIAL_STORE_N0 sized store - TODO confirm against the macro.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
8142*c217d954SCole Faust#endif
8143*c217d954SCole Faust#endif
8144*c217d954SCole Faust
8145*c217d954SCole Faust#if defined(BETA)
8146*c217d954SCole Faust
/** In-place accumulation for float data: dst = dst + BETA * src.
 *
 * Each invocation reads four consecutive floats at dst.ptr and at src.ptr,
 * combines them and writes the four results back to dst.ptr. The positions
 * of src.ptr and dst.ptr come from CONVERT_TO_TENSOR3D_STRUCT, which is
 * defined elsewhere (presumably it selects the element for the current
 * work-item - TODO confirm).
 *
 * Requires BETA to be defined at compile time.
 *
 * @param src  Tensor whose values are multiplied by BETA.
 * @param dst  Tensor that is both the first addend and the destination.
 */
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Values currently stored in dst (by its name, presumably an already scaled A*B product).
    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

    // Values from src that get weighted by BETA.
    float4 c = vload4(0, (__global float *)src.ptr);

    float4 out = alpha_ab + (float4)BETA * c;

    // Overwrite the same four dst elements that were read above.
    vstore4(out, 0, (__global float *)dst.ptr);
}
8166*c217d954SCole Faust
8167*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
8168*c217d954SCole Faust
/** In-place accumulation for half data: dst = dst + BETA * src.
 *
 * Each invocation reads eight consecutive halves at dst.ptr and at src.ptr,
 * combines them and writes the eight results back to dst.ptr. The positions
 * of src.ptr and dst.ptr come from CONVERT_TO_TENSOR3D_STRUCT, which is
 * defined elsewhere (presumably it selects the element for the current
 * work-item - TODO confirm).
 *
 * Requires BETA to be defined at compile time; the surrounding guard also
 * requires ARM_COMPUTE_OPENCL_FP16_ENABLED.
 *
 * @param src  Tensor whose values are multiplied by BETA.
 * @param dst  Tensor that is both the first addend and the destination.
 */
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Values currently stored in dst (by its name, presumably an already scaled A*B product).
    half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

    // Values from src that get weighted by BETA.
    half8 c = vload8(0, (__global half *)src.ptr);

    half8 out = alpha_ab + (half8)BETA * c;

    // Overwrite the same eight dst elements that were read above.
    vstore8(out, 0, (__global half *)dst.ptr);
}
8188*c217d954SCole Faust#endif
8189*c217d954SCole Faust#endif  )"