Lines Matching refs:C

91         $for C in range(4, CHANNEL_TILE, 4):
92 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
94 $for C in range(0, CHANNEL_TILE, 4):
95 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
97 $for C in range(0, CHANNEL_TILE, 4):
98 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
100 $for C in range(0, CHANNEL_TILE, 4):
102 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f…
104 $for C in range(0, CHANNEL_TILE, 4):
105 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
107 $for C in range(0, CHANNEL_TILE, 4):
109 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f…
111 $for C in range(0, CHANNEL_TILE, 4):
112 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
114 $for C in range(0, CHANNEL_TILE, 4):
116 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f…
118 $for C in range(0, CHANNEL_TILE, 4):
119 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
121 $for C in range(0, CHANNEL_TILE, 4):
123 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_low_f…
125 $for C in range(0, CHANNEL_TILE, 4):
126 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
128 $for C in range(0, CHANNEL_TILE, 4):
130 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_low_f…
132 $for C in range(0, CHANNEL_TILE, 4):
133 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
135 $for C in range(0, CHANNEL_TILE, 4):
137 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_low_f…
139 $for C in range(0, CHANNEL_TILE, 4):
140 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
142 $for C in range(0, CHANNEL_TILE, 4):
144 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_…
146 $for C in range(0, CHANNEL_TILE, 4):
147 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
149 $for C in range(0, CHANNEL_TILE, 4):
151 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_…
153 $for C in range(0, CHANNEL_TILE, 4):
154 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
156 $for C in range(0, CHANNEL_TILE, 4):
158 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_…
160 $for C in range(0, CHANNEL_TILE, 4):
161 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
163 $for C in range(0, CHANNEL_TILE, 4):
165 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_high_…
167 $for C in range(0, CHANNEL_TILE, 4):
168 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
170 $for C in range(0, CHANNEL_TILE, 4):
172 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_high_…
174 $for C in range(0, CHANNEL_TILE, 4):
175 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
177 $for C in range(0, CHANNEL_TILE, 4):
179 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_high_…
181 $for C in range(0, CHANNEL_TILE, 4):
182 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
188 $for C in range(0, CHANNEL_TILE, 4):
190 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f…
192 $for C in range(0, CHANNEL_TILE, 4):
193 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
195 $for C in range(0, CHANNEL_TILE, 4):
197 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f…
199 $for C in range(0, CHANNEL_TILE, 4):
200 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
202 $for C in range(0, CHANNEL_TILE, 4):
204 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f…
206 $for C in range(0, CHANNEL_TILE, 4):
207 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
209 $for C in range(0, CHANNEL_TILE, 4):
211 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_low_f…
213 $for C in range(0, CHANNEL_TILE, 4):
214 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
216 $for C in range(0, CHANNEL_TILE, 4):
218 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_low_f…
220 $for C in range(0, CHANNEL_TILE, 4):
221 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
223 $for C in range(0, CHANNEL_TILE, 4):
225 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_low_f…
227 $for C in range(0, CHANNEL_TILE, 4):
228 const float32x4_t vk02c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 19});
230 $for C in range(0, CHANNEL_TILE, 4):
232 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_high_…
234 $for C in range(0, CHANNEL_TILE, 4):
235 const float32x4_t vk12c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 20});
237 $for C in range(0, CHANNEL_TILE, 4):
239 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_high_…
241 $for C in range(0, CHANNEL_TILE, 4):
242 const float32x4_t vk22c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 21});
244 $for C in range(0, CHANNEL_TILE, 4):
246 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_high_…
248 $for C in range(0, CHANNEL_TILE, 4):
249 const float32x4_t vk02c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 22});
251 $for C in range(0, CHANNEL_TILE, 4):
253 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vget_high_…
255 $for C in range(0, CHANNEL_TILE, 4):
256 const float32x4_t vk12c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 23});
258 $for C in range(0, CHANNEL_TILE, 4):
260 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vget_high_…
262 $for C in range(0, CHANNEL_TILE, 4):
263 const float32x4_t vk22c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 24});
265 $for C in range(0, CHANNEL_TILE, 4):
267 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vget_high_…
269 $for C in range(0, CHANNEL_TILE, 4):
270 const float32x4_t vk02c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 25});
276 $for C in range(0, CHANNEL_TILE, 4):
278 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vi${Y*2}x2…
280 $for C in range(0, CHANNEL_TILE, 4):
281 const float32x4_t vk12c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 26});
283 $for C in range(0, CHANNEL_TILE, 4):
285 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vi${Y*2+1}…
287 $for C in range(0, CHANNEL_TILE, 4):
288 const float32x4_t vk22c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 27});
290 $for C in range(0, CHANNEL_TILE, 4):
292 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vi${Y*2+2}…
301 $for C in range(0, CHANNEL_TILE, 4):
303 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
305 $for C in range(0, CHANNEL_TILE, 4):
307 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
312 $for C in range(4, CHANNEL_TILE, 4):
313 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
325 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
327 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
328 … vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
345 $for C in range(4, CHANNEL_TILE, 4):
346 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
348 $for C in range(0, CHANNEL_TILE, 4):
349 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
351 $for C in range(0, CHANNEL_TILE, 4):
352 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
354 $for C in range(0, CHANNEL_TILE, 4):
356 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f…
358 $for C in range(0, CHANNEL_TILE, 4):
359 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
361 $for C in range(0, CHANNEL_TILE, 4):
363 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f…
365 $for C in range(0, CHANNEL_TILE, 4):
366 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
368 $for C in range(0, CHANNEL_TILE, 4):
370 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f…
372 $for C in range(0, CHANNEL_TILE, 4):
373 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
375 $for C in range(0, CHANNEL_TILE, 4):
377 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_low_f…
379 $for C in range(0, CHANNEL_TILE, 4):
380 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
382 $for C in range(0, CHANNEL_TILE, 4):
384 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_low_f…
386 $for C in range(0, CHANNEL_TILE, 4):
387 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
389 $for C in range(0, CHANNEL_TILE, 4):
391 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_low_f…
393 $for C in range(0, CHANNEL_TILE, 4):
394 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
396 $for C in range(0, CHANNEL_TILE, 4):
398 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_…
400 $for C in range(0, CHANNEL_TILE, 4):
401 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
403 $for C in range(0, CHANNEL_TILE, 4):
405 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_…
407 $for C in range(0, CHANNEL_TILE, 4):
408 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
410 $for C in range(0, CHANNEL_TILE, 4):
412 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_…
414 $for C in range(0, CHANNEL_TILE, 4):
415 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
417 $for C in range(0, CHANNEL_TILE, 4):
419 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_high_…
421 $for C in range(0, CHANNEL_TILE, 4):
422 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
424 $for C in range(0, CHANNEL_TILE, 4):
426 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_high_…
428 $for C in range(0, CHANNEL_TILE, 4):
429 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
431 $for C in range(0, CHANNEL_TILE, 4):
433 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_high_…
435 $for C in range(0, CHANNEL_TILE, 4):
436 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
442 $for C in range(0, CHANNEL_TILE, 4):
444 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vi${Y*2}x1…
446 $for C in range(0, CHANNEL_TILE, 4):
447 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
449 $for C in range(0, CHANNEL_TILE, 4):
451 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vi${Y*2+1}…
453 $for C in range(0, CHANNEL_TILE, 4):
454 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
456 $for C in range(0, CHANNEL_TILE, 4):
458 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vi${Y*2+2}…
460 $for C in range(0, CHANNEL_TILE, 4):
461 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
463 $for C in range(0, CHANNEL_TILE, 4):
465 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vi${Y*2}x1…
467 $for C in range(0, CHANNEL_TILE, 4):
468 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
470 $for C in range(0, CHANNEL_TILE, 4):
472 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vi${Y*2+1}…
474 $for C in range(0, CHANNEL_TILE, 4):
475 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
477 $for C in range(0, CHANNEL_TILE, 4):
479 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vi${Y*2+2}…
485 $for C in range(0, CHANNEL_TILE, 4):
487 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
489 $for C in range(0, CHANNEL_TILE, 4):
491 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
496 $for C in range(4, CHANNEL_TILE, 4):
497 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
509 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
511 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
512 … vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};