xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/common/range.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 R"(
2 
3 #ifndef ARM_COMPUTE_HELPER_H
4 #define ARM_COMPUTE_HELPER_H
5 
6 
7 
8 
9 #define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
10     VSTORE(N0)                                                 \
11     (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
12 
13 #define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
14     STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
15     VSTORE(N0)                                                 \
16     (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
17 
18 #define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
19     STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
20     VSTORE(N0)                                                 \
21     (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
22 
23 #define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
24     STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
25     VSTORE(N0)                                                 \
26     (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
27 
28 #define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
29     STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
30     VSTORE(N0)                                                 \
31     (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
32 
33 #define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
34     STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
35     VSTORE(N0)                                                 \
36     (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
37 
38 #define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
39     STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
40     VSTORE(N0)                                                 \
41     (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
42 
43 #define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
44     STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
45     VSTORE(N0)                                                 \
46     (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
47 
48 #define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
49     STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
50     VSTORE(N0)                                                 \
51     (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
52 
53 #define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
54     STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
55     VSTORE(N0)                                                  \
56     (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
57 
58 #define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
59     STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
60     VSTORE(N0)                                                  \
61     (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
62 
63 #define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
64     STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
65     VSTORE(N0)                                                  \
66     (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
67 
68 #define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
69     STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
70     VSTORE(N0)                                                  \
71     (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
72 
73 #define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
74     STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
75     VSTORE(N0)                                                  \
76     (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
77 
78 #define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
79     STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
80     VSTORE(N0)                                                  \
81     (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
82 
83 #define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
84     STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
85     VSTORE(N0)                                                  \
86     (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
87 
88 
89 
90 #define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
91     VSTORE(N0)                                                         \
92     (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
93 
94 #define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
95     CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
96     VSTORE(N0)                                                         \
97     (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
98 
99 #define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
100     CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
101     VSTORE(N0)                                                         \
102     (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
103 
104 #define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
105     CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
106     VSTORE(N0)                                                         \
107     (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
108 
109 #define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
110     CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
111     VSTORE(N0)                                                         \
112     (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
113 
114 #define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
115     CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
116     VSTORE(N0)                                                         \
117     (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
118 
119 #define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
120     CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
121     VSTORE(N0)                                                         \
122     (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
123 
124 #define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
125     CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
126     VSTORE(N0)                                                         \
127     (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
128 
129 #define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
130     CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
131     VSTORE(N0)                                                         \
132     (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
133 
134 #define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
135     CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
136     VSTORE(N0)                                                     \
137     (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
138 
139 #define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
140     CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
141     VSTORE(N0)                                                          \
142     (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
143 
144 #define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
145     CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
146     VSTORE(N0)                                                          \
147     (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
148 
149 #define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
150     CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
151     VSTORE(N0)                                                          \
152     (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
153 
154 #define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
155     CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
156     VSTORE(N0)                                                          \
157     (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
158 
159 #define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
160     CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
161     VSTORE(N0)                                                          \
162     (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
163 
164 #define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
165     CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
166     VSTORE(N0)                                                          \
167     (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
168 
169 
170 
171 
172 #define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
173 #define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
174 
175 
176 
177 #define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
178 #define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
179 
180 
181 
182 #define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
183     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
184     (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
185 
186 #define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
187     STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
188     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
189     (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
190 
191 #define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
192     STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
193     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
194     (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
195 
196 #define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
197     STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
198     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
199     (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
200 
201 #define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
202     STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
203     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
204     (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
205 
206 #define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
207     STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
208     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
209     (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
210 
211 #define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
212     STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
213     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
214     (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
215 
216 #define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
217     STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
218     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
219     (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
220 
221 #define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
222     STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
223     VSTORE_PARTIAL(N0, STORE_N0)                                                 \
224     (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
225 
226 #define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
227     STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
228     VSTORE_PARTIAL(N0, STORE_N0)                                                  \
229     (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
230 
231 #define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
232     STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
233     VSTORE_PARTIAL(N0, STORE_N0)                                                  \
234     (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
235 
236 #define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
237     STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
238     VSTORE_PARTIAL(N0, STORE_N0)                                                  \
239     (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
240 
241 #define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
242     STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
243     VSTORE_PARTIAL(N0, STORE_N0)                                                  \
244     (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
245 
246 #define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
247     STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
248     VSTORE_PARTIAL(N0, STORE_N0)                                                  \
249     (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
250 
251 #define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
252     STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
253     VSTORE_PARTIAL(N0, STORE_N0)                                                  \
254     (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
255 
256 #define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
257     STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
258     VSTORE_PARTIAL(N0, STORE_N0)                                                  \
259     (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
260 
261 
262 
263 #define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
264 #define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
265 
266 #define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
267     if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
268     {                                                                                                                                                     \
269         STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
270     }                                                                                                                                                     \
271     else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
272     {                                                                                                                                                     \
273         STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
274     }                                                                                                                                                     \
275     else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
276     {                                                                                                                                                     \
277         STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
278     }                                                                                                                                                     \
279     else                                                                                                                                                  \
280     {                                                                                                                                                     \
281         STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
282     }
283 
284 #define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
285     if(!(PARTIAL_COND_X))                                                                                         \
286     {                                                                                                             \
287         STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
288     }                                                                                                             \
289     else                                                                                                          \
290     {                                                                                                             \
291         STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
292     }
293 
294 #define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
295     if(!(PARTIAL_COND_Y))                                                                                         \
296     {                                                                                                             \
297         STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
298     }                                                                                                             \
299     else                                                                                                          \
300     {                                                                                                             \
301         STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
302     }
303 
304 
305 #if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
306 
307 
308 #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
309 
310 #define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
311     STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
312 
313 #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
314 
315 #define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
316     STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
317 
318 #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
319 
320 #define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
321     STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
322 
323 #else
324 
325 #define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
326     STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
327 
328 #endif
329 
330 #endif
331 
332 
333 #if defined(PARTIAL_STORE_M0)
334 
335 #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
336     ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
337 #else
338 #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
339     ((uint)(y * M0))
340 #endif
341 
342 
343 
344 #define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
345     STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
346 
347 
348 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
349 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
350 #endif
351 
352 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
353 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
354 #endif
355 
356 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
357 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
358 #endif
359 
360 #if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
361 #pragma OPENCL EXTENSION cl_arm_printf : enable
362 #endif
363 
364 #define GPU_ARCH_MIDGARD 0x100
365 #define GPU_ARCH_BIFROST 0x200
366 #define GPU_ARCH_VALHALL 0x300
367 
368 
369 #define CONCAT(a, b) a##b
370 
371 
372 #define EXPAND(x) x
373 
374 
375 #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
376 
377 
378 #define REV1(x) ((x))
379 #define REV2(x) ((x).s10)
380 #define REV3(x) ((x).s210)
381 #define REV4(x) ((x).s3210)
382 #define REV8(x) ((x).s76543210)
383 #define REV16(x) ((x).sFEDCBA9876543210)
384 
385 
386 
387 #define REVERSE_STR(x, s) REV##s((x))
388 #define REVERSE(x, s) REVERSE_STR(x, s)
389 
390 
391 
392 #define ROT1_0(x) ((x))
393 #define ROT1_1(x) ((x))
394 
395 #define ROT2_0(x) ((x))
396 #define ROT2_1(x) ((x).s10)
397 #define ROT2_2(x) ((x))
398 
399 #define ROT3_0(x) ((x))
400 #define ROT3_1(x) ((x).s201)
401 #define ROT3_2(x) ((x).s120)
402 #define ROT3_3(x) ((x))
403 
404 #define ROT4_0(x) ((x))
405 #define ROT4_1(x) ((x).s3012)
406 #define ROT4_2(x) ((x).s2301)
407 #define ROT4_3(x) ((x).s1230)
408 #define ROT4_4(x) ((x))
409 
410 #define ROT8_0(x) ((x))
411 #define ROT8_1(x) ((x).s70123456)
412 #define ROT8_2(x) ((x).s67012345)
413 #define ROT8_3(x) ((x).s56701234)
414 #define ROT8_4(x) ((x).s45670123)
415 #define ROT8_5(x) ((x).s34567012)
416 #define ROT8_6(x) ((x).s23456701)
417 #define ROT8_7(x) ((x).s12345670)
418 #define ROT8_8(x) ((x))
419 
420 #define ROT16_0(x) ((x))
421 #define ROT16_1(x) ((x).sF0123456789ABCDE)
422 #define ROT16_2(x) ((x).sEF0123456789ABCD)
423 #define ROT16_3(x) ((x).sDEF0123456789ABC)
424 #define ROT16_4(x) ((x).sCDEF0123456789AB)
425 #define ROT16_5(x) ((x).sBCDEF0123456789A)
426 #define ROT16_6(x) ((x).sABCDEF0123456789)
427 #define ROT16_7(x) ((x).s9ABCDEF012345678)
428 #define ROT16_8(x) ((x).s89ABCDEF01234567)
429 #define ROT16_9(x) ((x).s789ABCDEF0123456)
430 #define ROT16_10(x) ((x).s6789ABCDEF012345)
431 #define ROT16_11(x) ((x).s56789ABCDEF01234)
432 #define ROT16_12(x) ((x).s456789ABCDEF0123)
433 #define ROT16_13(x) ((x).s3456789ABCDEF012)
434 #define ROT16_14(x) ((x).s23456789ABCDEF01)
435 #define ROT16_15(x) ((x).s123456789ABCDEF0)
436 #define ROT16_16(x) ((x))
437 
438 
439 
440 #define ROTATE_STR(x, s, n) ROT##s##_##n(x)
441 #define ROTATE(x, s, n) ROTATE_STR(x, s, n)
442 
443 
444 
445 #define V_OFFS1(dt) (dt##1)(0)
446 #define V_OFFS2(dt) (dt##2)(0, 1)
447 #define V_OFFS3(dt) (dt##3)(0, 1, 2)
448 #define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
449 #define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
450 #define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
451 
452 
453 
454 #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
455 #define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
456 
457 
458 #define VLOAD_STR(size) vload##size
459 #define VLOAD(size) VLOAD_STR(size)
460 
461 
462 #define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
463 #define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
464 
465 #define NO_LOAD(data, offs, ptr) \
466     {                            \
467     }
468 
469 
470 #define vload_partial_1_0 NO_LOAD
471 #define vload_partial_1_1 vload1
472 #define vload_partial_1_2 NO_LOAD
473 #define vload_partial_1_3 NO_LOAD
474 #define vload_partial_1_4 NO_LOAD
475 #define vload_partial_1_5 NO_LOAD
476 #define vload_partial_1_6 NO_LOAD
477 #define vload_partial_1_7 NO_LOAD
478 #define vload_partial_1_8 NO_LOAD
479 #define vload_partial_1_9 NO_LOAD
480 #define vload_partial_1_10 NO_LOAD
481 #define vload_partial_1_11 NO_LOAD
482 #define vload_partial_1_12 NO_LOAD
483 #define vload_partial_1_13 NO_LOAD
484 #define vload_partial_1_14 NO_LOAD
485 #define vload_partial_1_15 NO_LOAD
486 #define vload_partial_1_16 NO_LOAD
487 
488 #define vload_partial_2_0 NO_LOAD
489 #define vload_partial_2_1 vload_partial_1
490 #define vload_partial_2_2 vload_partial_2
491 #define vload_partial_2_3 NO_LOAD
492 #define vload_partial_2_4 NO_LOAD
493 #define vload_partial_2_5 NO_LOAD
494 #define vload_partial_2_6 NO_LOAD
495 #define vload_partial_2_7 NO_LOAD
496 #define vload_partial_2_8 NO_LOAD
497 #define vload_partial_2_9 NO_LOAD
498 #define vload_partial_2_10 NO_LOAD
499 #define vload_partial_2_11 NO_LOAD
500 #define vload_partial_2_12 NO_LOAD
501 #define vload_partial_2_13 NO_LOAD
502 #define vload_partial_2_14 NO_LOAD
503 #define vload_partial_2_15 NO_LOAD
504 #define vload_partial_2_16 NO_LOAD
505 
506 #define vload_partial_3_0 NO_LOAD
507 #define vload_partial_3_1 vload_partial_1
508 #define vload_partial_3_2 vload_partial_2
509 #define vload_partial_3_3 vload_partial_3
510 #define vload_partial_3_4 NO_LOAD
511 #define vload_partial_3_5 NO_LOAD
512 #define vload_partial_3_6 NO_LOAD
513 #define vload_partial_3_7 NO_LOAD
514 #define vload_partial_3_8 NO_LOAD
515 #define vload_partial_3_9 NO_LOAD
516 #define vload_partial_3_10 NO_LOAD
517 #define vload_partial_3_11 NO_LOAD
518 #define vload_partial_3_12 NO_LOAD
519 #define vload_partial_3_13 NO_LOAD
520 #define vload_partial_3_14 NO_LOAD
521 #define vload_partial_3_15 NO_LOAD
522 #define vload_partial_3_16 NO_LOAD
523 
524 #define vload_partial_4_0 NO_LOAD
525 #define vload_partial_4_1 vload_partial_1
526 #define vload_partial_4_2 vload_partial_2
527 #define vload_partial_4_3 vload_partial_3
528 #define vload_partial_4_4 vload_partial_4
529 #define vload_partial_4_5 NO_LOAD
530 #define vload_partial_4_6 NO_LOAD
531 #define vload_partial_4_7 NO_LOAD
532 #define vload_partial_4_8 NO_LOAD
533 #define vload_partial_4_9 NO_LOAD
534 #define vload_partial_4_10 NO_LOAD
535 #define vload_partial_4_11 NO_LOAD
536 #define vload_partial_4_12 NO_LOAD
537 #define vload_partial_4_13 NO_LOAD
538 #define vload_partial_4_14 NO_LOAD
539 #define vload_partial_4_15 NO_LOAD
540 #define vload_partial_4_16 NO_LOAD
541 
542 #define vload_partial_8_0 NO_LOAD
543 #define vload_partial_8_1 vload_partial_1
544 #define vload_partial_8_2 vload_partial_2
545 #define vload_partial_8_3 vload_partial_3
546 #define vload_partial_8_4 vload_partial_4
547 #define vload_partial_8_5 vload_partial_5
548 #define vload_partial_8_6 vload_partial_6
549 #define vload_partial_8_7 vload_partial_7
550 #define vload_partial_8_8 vload_partial_8
551 #define vload_partial_8_9 NO_LOAD
552 #define vload_partial_8_10 NO_LOAD
553 #define vload_partial_8_11 NO_LOAD
554 #define vload_partial_8_12 NO_LOAD
555 #define vload_partial_8_13 NO_LOAD
556 #define vload_partial_8_14 NO_LOAD
557 #define vload_partial_8_15 NO_LOAD
558 #define vload_partial_8_16 NO_LOAD
559 
560 #define vload_partial_16_0 NO_LOAD
561 #define vload_partial_16_1 vload_partial_1
562 #define vload_partial_16_2 vload_partial_2
563 #define vload_partial_16_3 vload_partial_3
564 #define vload_partial_16_4 vload_partial_4
565 #define vload_partial_16_5 vload_partial_5
566 #define vload_partial_16_6 vload_partial_6
567 #define vload_partial_16_7 vload_partial_7
568 #define vload_partial_16_8 vload_partial_8
569 #define vload_partial_16_9 vload_partial_9
570 #define vload_partial_16_10 vload_partial_10
571 #define vload_partial_16_11 vload_partial_11
572 #define vload_partial_16_12 vload_partial_12
573 #define vload_partial_16_13 vload_partial_13
574 #define vload_partial_16_14 vload_partial_14
575 #define vload_partial_16_15 vload_partial_15
576 #define vload_partial_16_16 vload_partial_16
577 
578 
579 #define vload_partial_1(DATA, OFFSET, PTR) \
580     DATA.s0 = vload1(OFFSET, PTR);
581 
582 #define vload_partial_2(DATA, OFFSET, PTR) \
583     DATA.s01 = vload2(OFFSET, PTR);
584 
585 #define vload_partial_3(DATA, OFFSET, PTR) \
586     DATA.s012 = vload3(OFFSET, PTR);
587 
588 #define vload_partial_4(DATA, OFFSET, PTR) \
589     DATA.s0123 = vload4(OFFSET, PTR);
590 
591 #define vload_partial_5(DATA, OFFSET, PTR)    \
592     vload_partial_4(DATA.s0123, OFFSET, PTR); \
593     DATA.s4 = vload1(OFFSET, PTR + 4);
594 
595 #define vload_partial_6(DATA, OFFSET, PTR)    \
596     vload_partial_4(DATA.s0123, OFFSET, PTR); \
597     vload_partial_2(DATA.s45, OFFSET, PTR + 4);
598 
599 #define vload_partial_7(DATA, OFFSET, PTR)    \
600     vload_partial_4(DATA.s0123, OFFSET, PTR); \
601     vload_partial_3(DATA.s456, OFFSET, PTR + 4);
602 
603 #define vload_partial_8(DATA, OFFSET, PTR) \
604     DATA.s01234567 = vload8(OFFSET, PTR);
605 
606 #define vload_partial_9(DATA, OFFSET, PTR)        \
607     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
608     DATA.s8 = vload1(OFFSET, PTR + 8);
609 
610 #define vload_partial_10(DATA, OFFSET, PTR)       \
611     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
612     vload_partial_2(DATA.s89, OFFSET, PTR + 8);
613 
614 #define vload_partial_11(DATA, OFFSET, PTR)       \
615     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
616     vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
617 
618 #define vload_partial_12(DATA, OFFSET, PTR)       \
619     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
620     vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
621 
622 #define vload_partial_13(DATA, OFFSET, PTR)       \
623     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
624     vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
625 
626 #define vload_partial_14(DATA, OFFSET, PTR)       \
627     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
628     vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
629 
630 #define vload_partial_15(DATA, OFFSET, PTR)       \
631     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
632     vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
633 
634 #define vload_partial_16(DATA, OFFSET, PTR) \
635     DATA = vload16(OFFSET, PTR);
636 
637 
638 
639 #define PIXEL_UNIT4 1
640 #define PIXEL_UNIT8 2
641 #define PIXEL_UNIT16 4
642 
643 
644 #define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
645 #define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
646 
647 
648 #define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
649 #define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
650 #define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
651 
652 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
653 #define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
654 #define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
655 #define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
656 #endif
657 
658 #define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
659 #define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
660 #define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
661 
662 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
663 #define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
664 #define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
665 #define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
666 #endif
667 
668 
669 #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
670 #define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
671 
672 
673 #define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
674 #define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
675 
676 #define VSTORE_STR(size) vstore##size
677 #define VSTORE(size) VSTORE_STR(size)
678 
679 #define float1 float
680 #define half1 half
681 #define char1 char
682 #define uchar1 uchar
683 #define short1 short
684 #define ushort1 ushort
685 #define int1 int
686 #define uint1 uint
687 #define long1 long
688 #define ulong1 ulong
689 #define double1 double
690 
691 #define vload1(OFFSET, PTR) *(OFFSET + PTR)
692 #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
693 
694 
695 #define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
696 #define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
697 
698 #define NO_STORE(data, offs, ptr) \
699     {                             \
700     }
701 
702 
703 #define vstore_partial_1_0 NO_STORE
704 #define vstore_partial_1_1 vstore1
705 #define vstore_partial_1_2 NO_STORE
706 #define vstore_partial_1_3 NO_STORE
707 #define vstore_partial_1_4 NO_STORE
708 #define vstore_partial_1_5 NO_STORE
709 #define vstore_partial_1_6 NO_STORE
710 #define vstore_partial_1_7 NO_STORE
711 #define vstore_partial_1_8 NO_STORE
712 #define vstore_partial_1_9 NO_STORE
713 #define vstore_partial_1_10 NO_STORE
714 #define vstore_partial_1_11 NO_STORE
715 #define vstore_partial_1_12 NO_STORE
716 #define vstore_partial_1_13 NO_STORE
717 #define vstore_partial_1_14 NO_STORE
718 #define vstore_partial_1_15 NO_STORE
719 #define vstore_partial_1_16 NO_STORE
720 
721 #define vstore_partial_2_0 NO_STORE
722 #define vstore_partial_2_1 vstore_partial_1
723 #define vstore_partial_2_2 vstore_partial_2
724 #define vstore_partial_2_3 NO_STORE
725 #define vstore_partial_2_4 NO_STORE
726 #define vstore_partial_2_5 NO_STORE
727 #define vstore_partial_2_6 NO_STORE
728 #define vstore_partial_2_7 NO_STORE
729 #define vstore_partial_2_8 NO_STORE
730 #define vstore_partial_2_9 NO_STORE
731 #define vstore_partial_2_10 NO_STORE
732 #define vstore_partial_2_11 NO_STORE
733 #define vstore_partial_2_12 NO_STORE
734 #define vstore_partial_2_13 NO_STORE
735 #define vstore_partial_2_14 NO_STORE
736 #define vstore_partial_2_15 NO_STORE
737 #define vstore_partial_2_16 NO_STORE
738 
739 #define vstore_partial_3_0 NO_STORE
740 #define vstore_partial_3_1 vstore_partial_1
741 #define vstore_partial_3_2 vstore_partial_2
742 #define vstore_partial_3_3 vstore_partial_3
743 #define vstore_partial_3_4 NO_STORE
744 #define vstore_partial_3_5 NO_STORE
745 #define vstore_partial_3_6 NO_STORE
746 #define vstore_partial_3_7 NO_STORE
747 #define vstore_partial_3_8 NO_STORE
748 #define vstore_partial_3_9 NO_STORE
749 #define vstore_partial_3_10 NO_STORE
750 #define vstore_partial_3_11 NO_STORE
751 #define vstore_partial_3_12 NO_STORE
752 #define vstore_partial_3_13 NO_STORE
753 #define vstore_partial_3_14 NO_STORE
754 #define vstore_partial_3_15 NO_STORE
755 #define vstore_partial_3_16 NO_STORE
756 
757 #define vstore_partial_4_0 NO_STORE
758 #define vstore_partial_4_1 vstore_partial_1
759 #define vstore_partial_4_2 vstore_partial_2
760 #define vstore_partial_4_3 vstore_partial_3
761 #define vstore_partial_4_4 vstore_partial_4
762 #define vstore_partial_4_5 NO_STORE
763 #define vstore_partial_4_6 NO_STORE
764 #define vstore_partial_4_7 NO_STORE
765 #define vstore_partial_4_8 NO_STORE
766 #define vstore_partial_4_9 NO_STORE
767 #define vstore_partial_4_10 NO_STORE
768 #define vstore_partial_4_11 NO_STORE
769 #define vstore_partial_4_12 NO_STORE
770 #define vstore_partial_4_13 NO_STORE
771 #define vstore_partial_4_14 NO_STORE
772 #define vstore_partial_4_15 NO_STORE
773 #define vstore_partial_4_16 NO_STORE
774 
775 #define vstore_partial_8_0 NO_STORE
776 #define vstore_partial_8_1 vstore_partial_1
777 #define vstore_partial_8_2 vstore_partial_2
778 #define vstore_partial_8_3 vstore_partial_3
779 #define vstore_partial_8_4 vstore_partial_4
780 #define vstore_partial_8_5 vstore_partial_5
781 #define vstore_partial_8_6 vstore_partial_6
782 #define vstore_partial_8_7 vstore_partial_7
783 #define vstore_partial_8_8 vstore_partial_8
784 #define vstore_partial_8_9 NO_STORE
785 #define vstore_partial_8_10 NO_STORE
786 #define vstore_partial_8_11 NO_STORE
787 #define vstore_partial_8_12 NO_STORE
788 #define vstore_partial_8_13 NO_STORE
789 #define vstore_partial_8_14 NO_STORE
790 #define vstore_partial_8_15 NO_STORE
791 #define vstore_partial_8_16 NO_STORE
792 
793 #define vstore_partial_16_0 NO_STORE
794 #define vstore_partial_16_1 vstore_partial_1
795 #define vstore_partial_16_2 vstore_partial_2
796 #define vstore_partial_16_3 vstore_partial_3
797 #define vstore_partial_16_4 vstore_partial_4
798 #define vstore_partial_16_5 vstore_partial_5
799 #define vstore_partial_16_6 vstore_partial_6
800 #define vstore_partial_16_7 vstore_partial_7
801 #define vstore_partial_16_8 vstore_partial_8
802 #define vstore_partial_16_9 vstore_partial_9
803 #define vstore_partial_16_10 vstore_partial_10
804 #define vstore_partial_16_11 vstore_partial_11
805 #define vstore_partial_16_12 vstore_partial_12
806 #define vstore_partial_16_13 vstore_partial_13
807 #define vstore_partial_16_14 vstore_partial_14
808 #define vstore_partial_16_15 vstore_partial_15
809 #define vstore_partial_16_16 vstore_partial_16
810 
811 
812 #define vstore_partial_1(DATA, OFFSET, PTR) \
813     vstore1(DATA.s0, OFFSET, PTR);
814 
815 #define vstore_partial_2(DATA, OFFSET, PTR) \
816     vstore2(DATA.s01, OFFSET, PTR);
817 
818 #define vstore_partial_3(DATA, OFFSET, PTR) \
819     vstore3(DATA.s012, OFFSET, PTR);
820 
821 #define vstore_partial_4(DATA, OFFSET, PTR) \
822     vstore4(DATA.s0123, OFFSET, PTR);
823 
824 #define vstore_partial_5(DATA, OFFSET, PTR)    \
825     vstore_partial_4(DATA.s0123, OFFSET, PTR); \
826     vstore1(DATA.s4, OFFSET, PTR + 4);
827 
828 #define vstore_partial_6(DATA, OFFSET, PTR)    \
829     vstore_partial_4(DATA.s0123, OFFSET, PTR); \
830     vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
831 
832 #define vstore_partial_7(DATA, OFFSET, PTR)    \
833     vstore_partial_4(DATA.s0123, OFFSET, PTR); \
834     vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
835 
836 #define vstore_partial_8(DATA, OFFSET, PTR) \
837     vstore8(DATA.s01234567, OFFSET, PTR);
838 
839 #define vstore_partial_9(DATA, OFFSET, PTR)        \
840     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
841     vstore1(DATA.s8, OFFSET, PTR + 8);
842 
843 #define vstore_partial_10(DATA, OFFSET, PTR)       \
844     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
845     vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
846 
847 #define vstore_partial_11(DATA, OFFSET, PTR)       \
848     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
849     vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
850 
851 #define vstore_partial_12(DATA, OFFSET, PTR)       \
852     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
853     vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
854 
855 #define vstore_partial_13(DATA, OFFSET, PTR)       \
856     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
857     vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
858 
859 #define vstore_partial_14(DATA, OFFSET, PTR)       \
860     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
861     vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
862 
863 #define vstore_partial_15(DATA, OFFSET, PTR)       \
864     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
865     vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
866 
867 #define vstore_partial_16(DATA, OFFSET, PTR) \
868     vstore16(DATA, OFFSET, PTR);
869 
870 
871 
872 
873 
874 #define convert_float_sat convert_float
875 #define convert_float1_sat convert_float
876 #define convert_float2_sat convert_float2
877 #define convert_float3_sat convert_float3
878 #define convert_float4_sat convert_float4
879 #define convert_float8_sat convert_float8
880 #define convert_float16_sat convert_float16
881 #define convert_half_sat convert_float
882 #define convert_half1_sat convert_half
883 #define convert_half2_sat convert_half2
884 #define convert_half3_sat convert_half3
885 #define convert_half4_sat convert_half4
886 #define convert_half8_sat convert_half8
887 #define convert_half16_sat convert_half16
888 
889 #define convert_float1 convert_float
890 #define convert_half1 convert_half
891 #define convert_char1 convert_char
892 #define convert_uchar1 convert_uchar
893 #define convert_short1 convert_short
894 #define convert_ushort1 convert_ushort
895 #define convert_int1 convert_int
896 #define convert_uint1 convert_uint
897 #define convert_long1 convert_long
898 #define convert_ulong1 convert_ulong
899 #define convert_double1 convert_double
900 
901 #define convert_char1_sat convert_char_sat
902 #define convert_uchar1_sat convert_uchar_sat
903 #define convert_uchar2_sat convert_uchar2_sat
904 #define convert_uchar3_sat convert_uchar3_sat
905 #define convert_uchar4_sat convert_uchar4_sat
906 #define convert_uchar8_sat convert_uchar8_sat
907 #define convert_uchar16_sat convert_uchar16_sat
908 #define convert_short1_sat convert_short_sat
909 #define convert_ushort1_sat convert_ushort_sat
910 #define convert_int1_sat convert_int_sat
911 #define convert_uint1_sat convert_uint_sat
912 #define convert_long1_sat convert_long_sat
913 #define convert_ulong1_sat convert_ulong_sat
914 #define convert_double1_sat convert_double_sat
915 
916 #define VEC_DATA_TYPE_STR(type, size) type##size
917 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
918 
919 #define CONVERT_STR(x, type) (convert_##type((x)))
920 #define CONVERT(x, type) CONVERT_STR(x, type)
921 
922 #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
923 #define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
924 
925 #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
926 #define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
927 
928 #define select_vec_dt_uchar(size) uchar##size
929 #define select_vec_dt_char(size) char##size
930 #define select_vec_dt_ushort(size) ushort##size
931 #define select_vec_dt_short(size) short##size
932 #define select_vec_dt_half(size) short##size
933 #define select_vec_dt_uint(size) uint##size
934 #define select_vec_dt_int(size) int##size
935 #define select_vec_dt_float(size) int##size
936 #define select_vec_dt_ulong(size) ulong##size
937 #define select_vec_dt_long(size) long##size
938 
939 #define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
940 #define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
941 #define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
942 
943 #define signed_int_vec_dt_uchar(size) char##size
944 #define signed_int_vec_dt_char(size) char##size
945 #define signed_int_vec_dt_ushort(size) short##size
946 #define signed_int_vec_dt_short(size) short##size
947 #define signed_int_vec_dt_half(size) short##size
948 #define signed_int_vec_dt_uint(size) int##size
949 #define signed_int_vec_dt_int(size) int##size
950 #define signed_int_vec_dt_float(size) int##size
951 #define signed_int_vec_dt_ulong(size) long##size
952 #define signed_int_vec_dt_long(size) long##size
953 
954 #define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
955 #define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
956 #define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
957 
958 #define sum_reduce_1(x) (x)
959 #define sum_reduce_2(x) ((x).s0) + ((x).s1)
960 #define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
961 #define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
962 #define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
963 #define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
964 
965 #define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
966 #define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
967 
968 #define prod_reduce_1(x) (x)
969 #define prod_reduce_2(x) ((x).s0) * ((x).s1)
970 #define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
971 #define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
972 #define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
973 #define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
974 
975 #define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
976 #define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
977 
978 #define max_reduce_1(x) (x)
979 #define max_reduce_2(x) max(((x).s0), ((x).s1))
980 #define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
981 #define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
982 #define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
983 #define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
984 
985 #define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
986 #define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
987 
988 #define VECTOR_DECLARATION(name)     \
989     __global uchar *name##_ptr,      \
990     uint        name##_stride_x, \
991     uint        name##_step_x,   \
992     uint        name##_offset_first_element_in_bytes
993 
994 #define IMAGE_DECLARATION(name)      \
995     __global uchar *name##_ptr,      \
996     uint        name##_stride_x, \
997     uint        name##_step_x,   \
998     uint        name##_stride_y, \
999     uint        name##_step_y,   \
1000     uint        name##_offset_first_element_in_bytes
1001 
1002 #define TENSOR3D_DECLARATION(name)   \
1003     __global uchar *name##_ptr,      \
1004     uint        name##_stride_x, \
1005     uint        name##_step_x,   \
1006     uint        name##_stride_y, \
1007     uint        name##_step_y,   \
1008     uint        name##_stride_z, \
1009     uint        name##_step_z,   \
1010     uint        name##_offset_first_element_in_bytes
1011 
1012 #define TENSOR4D_DECLARATION(name)   \
1013     __global uchar *name##_ptr,      \
1014     uint        name##_stride_x, \
1015     uint        name##_step_x,   \
1016     uint        name##_stride_y, \
1017     uint        name##_step_y,   \
1018     uint        name##_stride_z, \
1019     uint        name##_step_z,   \
1020     uint        name##_stride_w, \
1021     uint        name##_step_w,   \
1022     uint        name##_offset_first_element_in_bytes
1023 
1024 #define TENSOR5D_DECLARATION(name)   \
1025     __global uchar *name##_ptr,      \
1026     uint        name##_stride_x, \
1027     uint        name##_step_x,   \
1028     uint        name##_stride_y, \
1029     uint        name##_step_y,   \
1030     uint        name##_stride_z, \
1031     uint        name##_step_z,   \
1032     uint        name##_stride_w, \
1033     uint        name##_step_w,   \
1034     uint        name##_stride_v, \
1035     uint        name##_step_v,   \
1036     uint        name##_offset_first_element_in_bytes
1037 
1038 #define CONVERT_TO_VECTOR_STRUCT(name) \
1039     update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1040 
1041 #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1042     update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1043 
1044 #define CONVERT_TO_IMAGE_STRUCT(name) \
1045     update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1046 
1047 #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1048     update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1049 
1050 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1051     update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1052 
1053 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1054     update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1055 
1056 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1057     update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1058 
1059 #define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1060     update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1061                                  name##_stride_z, name##_step_z)
1062 
1063 #define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1064     update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1065 
1066 #define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1067     update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1068                                  name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1069 
1070 #define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1071     update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1072 
1073 #define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1074     tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1075                            name##_stride_z, name##_step_z)
1076 
1077 
1078 typedef struct Vector
1079 {
1080     __global uchar *ptr;
1081     int             offset_first_element_in_bytes;
1082     int             stride_x;
1083 } Vector;
1084 
1085 
1086 typedef struct Image
1087 {
1088     __global uchar *ptr;
1089     int             offset_first_element_in_bytes;
1090     int             stride_x;
1091     int             stride_y;
1092 } Image;
1093 
1094 
1095 typedef struct Tensor3D
1096 {
1097     __global uchar *ptr;
1098     int             offset_first_element_in_bytes;
1099     int             stride_x;
1100     int             stride_y;
1101     int             stride_z;
1102 } Tensor3D;
1103 
1104 
1105 typedef struct Tensor4D
1106 {
1107     __global uchar *ptr;
1108     int             offset_first_element_in_bytes;
1109     int             stride_x;
1110     int             stride_y;
1111     int             stride_z;
1112     int             stride_w;
1113 } Tensor4D;
1114 
1115 
1116 inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1117 {
1118     Vector vector =
1119     {
1120         .ptr                           = ptr,
1121         .offset_first_element_in_bytes = offset_first_element_in_bytes,
1122         .stride_x                      = stride_x,
1123     };
1124     vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1125     return vector;
1126 }
1127 
1128 
1129 inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1130 {
1131     Image img =
1132     {
1133         .ptr                           = ptr,
1134         .offset_first_element_in_bytes = offset_first_element_in_bytes,
1135         .stride_x                      = stride_x,
1136         .stride_y                      = stride_y
1137     };
1138     img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1139     return img;
1140 }
1141 
1142 
1143 inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1144 {
1145     Image img =
1146     {
1147         .ptr                           = ptr,
1148         .offset_first_element_in_bytes = offset_first_element_in_bytes,
1149         .stride_x                      = stride_x,
1150         .stride_y                      = stride_y
1151     };
1152     img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1153     return img;
1154 }
1155 
1156 
1157 inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1158 {
1159     Tensor3D tensor =
1160     {
1161         .ptr                           = ptr,
1162         .offset_first_element_in_bytes = offset_first_element_in_bytes,
1163         .stride_x                      = stride_x,
1164         .stride_y                      = stride_y,
1165         .stride_z                      = stride_z
1166     };
1167     tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1168     return tensor;
1169 }
1170 
1171 
1172 inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1173 {
1174     Tensor3D tensor =
1175     {
1176         .ptr                           = ptr,
1177         .offset_first_element_in_bytes = offset_first_element_in_bytes,
1178         .stride_x                      = stride_x,
1179         .stride_y                      = stride_y,
1180         .stride_z                      = stride_z
1181     };
1182     return tensor;
1183 }
1184 
1185 inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1186                                              uint step_w,
1187                                              uint mod_size)
1188 {
1189     Tensor4D tensor =
1190     {
1191         .ptr                           = ptr,
1192         .offset_first_element_in_bytes = offset_first_element_in_bytes,
1193         .stride_x                      = stride_x,
1194         .stride_y                      = stride_y,
1195         .stride_z                      = stride_z,
1196         .stride_w                      = stride_w
1197     };
1198 
1199     tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1200     return tensor;
1201 }
1202 
1203 
1204 inline __global const uchar *vector_offset(const Vector *vec, int x)
1205 {
1206     return vec->ptr + x * vec->stride_x;
1207 }
1208 
1209 
1210 inline __global uchar *offset(const Image *img, int x, int y)
1211 {
1212     return img->ptr + x * img->stride_x + y * img->stride_y;
1213 }
1214 
1215 
1216 inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1217 {
1218     return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1219 }
1220 
1221 
1222 inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1223 {
1224     return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1225 }
1226 
1227 
1228 inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1229 {
1230     uint num_elements = width * height;
1231 
1232     const uint z = index / num_elements;
1233 
1234     index %= num_elements;
1235 
1236     const uint y = index / width;
1237 
1238     index %= width;
1239 
1240     const uint x = index;
1241 
1242     return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1243 }
1244 
1245 #endif
1246 
1247 #if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER)
1248 
1249 #if !defined(OFFSET_OUT) && !defined(SCALE_OUT)
1250 
1251 #if VECTOR_SIZE == 2
1252 #define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 2))(0, STEP))
1253 #elif VECTOR_SIZE == 3
1254 #define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 3))(0, STEP, 2 * STEP))
1255 #elif VECTOR_SIZE == 4
1256 #define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 4))(0, STEP, 2 * STEP, 3 * STEP))
1257 #elif VECTOR_SIZE == 8
1258 #define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
1259 #elif VECTOR_SIZE == 16
1260 #define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
1261 #endif
1262 
1263 
1264 __kernel void range(
1265     VECTOR_DECLARATION(out))
1266 {
1267     uint     id             = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
1268     __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
1269 #if VECTOR_SIZE == 1
1270     DATA_TYPE seq;
1271     seq = (DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP;
1272 
1273     *(__global DATA_TYPE *)dst_ptr = seq;
1274 #else
1275     VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
1276     seq0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
1277     seq0 = seq0 + STEP_VEC;
1278     STORE_VECTOR_SELECT(seq, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1279 #endif
1280 }
1281 
1282 #else
1283 
1284 #if VECTOR_SIZE == 2
1285 #define STEP_VEC ((VEC_DATA_TYPE(float, 2))(0, STEP))
1286 #elif VECTOR_SIZE == 3
1287 #define STEP_VEC ((VEC_DATA_TYPE(float, 3))(0, STEP, 2 * STEP))
1288 #elif VECTOR_SIZE == 4
1289 #define STEP_VEC ((VEC_DATA_TYPE(float, 4))(0, STEP, 2 * STEP, 3 * STEP))
1290 #elif VECTOR_SIZE == 8
1291 #define STEP_VEC ((VEC_DATA_TYPE(float, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
1292 #elif VECTOR_SIZE == 16
1293 #define STEP_VEC ((VEC_DATA_TYPE(float, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
1294 #endif
1295 
1296 #define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
1297 #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
1298 
1299 
1300 __kernel void range_quantized(
1301     VECTOR_DECLARATION(out))
1302 {
1303     uint     id             = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
1304     __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
1305 #if VECTOR_SIZE == 1
1306     float           seq;
1307     seq                            = (float)START + (float)id * (float)STEP;
1308     seq                            = (DATA_TYPE)(int)(seq / ((float)SCALE_OUT) + (float)OFFSET_OUT);
1309     seq                            = max(0.0f, min(seq, 255.0f));
1310     *(__global DATA_TYPE *)dst_ptr = CONVERT_SAT(CONVERT_DOWN(seq, int), DATA_TYPE);
1311 #else
1312     VEC_DATA_TYPE(float, VECTOR_SIZE)
1313     seq = (float)START + id * (float)STEP;
1314     seq = seq + STEP_VEC;
1315     seq = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT));
1316     seq = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f)));
1317     VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
1318     res0 = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
1319     STORE_VECTOR_SELECT(res, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1320 #endif
1321 }
1322 #endif
1323 
1324 #endif  )"